This section consists of various stages of geo-spatial analysis of the data.
# Notebook shell magics: upgrade pip and install the Python dependencies.
!pip3 install -U pip
!pip3 install -r requirements.txt
# System-level packages needed to build the geo stack (cartopy/pyproj/shapely);
# left commented out because they only need to be installed once per machine.
# !sudo apt install python3-dev
# !sudo apt-get install libproj-dev proj-data proj-bin
# !sudo apt-get install libgeos-dev
# !sudo apt-get install -y python3-pyproj
Requirement already satisfied: pip in /home/ali/anaconda3/lib/python3.8/site-packages (21.2.2) Requirement already satisfied: calmap in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 1)) (0.0.9) Requirement already satisfied: tweepy in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 2)) (3.10.0) Requirement already satisfied: pandas in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 3)) (1.2.4) Requirement already satisfied: geopy in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 4)) (2.1.0) Requirement already satisfied: tqdm in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 5)) (4.61.1) Requirement already satisfied: country_converter in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 6)) (0.7.3) Requirement already satisfied: pandarallel in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 7)) (1.5.2) Requirement already satisfied: swifter in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 8)) (1.0.7) Requirement already satisfied: seaborn in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 9)) (0.11.1) Requirement already satisfied: shapely in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 10)) (1.7.1) Requirement already satisfied: geopandas in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 11)) (0.9.0) Requirement already satisfied: cython in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 12)) (0.29.23) Requirement already satisfied: geoplot in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 13)) (0.4.1) Requirement already satisfied: folium in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 14)) (0.12.1) Requirement already satisfied: plotly in 
/home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 15)) (4.14.3) Requirement already satisfied: matplotlib in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 16)) (3.4.2) Requirement already satisfied: iso3166 in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 17)) (1.0.1) Requirement already satisfied: pymongo in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 18)) (3.11.4) Requirement already satisfied: wordcloud in /home/ali/.local/lib/python3.8/site-packages (from -r requirements.txt (line 19)) (1.8.1) Requirement already satisfied: textblob in /home/ali/anaconda3/lib/python3.8/site-packages (from -r requirements.txt (line 20)) (0.15.3) Requirement already satisfied: covid_daily in /home/ali/anaconda3/lib/python3.8/site-packages (from -r requirements.txt (line 21)) (1.3.0) Requirement already satisfied: numpy in /home/ali/anaconda3/lib/python3.8/site-packages (from calmap->-r requirements.txt (line 1)) (1.19.5) Requirement already satisfied: requests[socks]>=2.11.1 in /home/ali/anaconda3/lib/python3.8/site-packages (from tweepy->-r requirements.txt (line 2)) (2.25.1) Requirement already satisfied: six>=1.10.0 in /home/ali/anaconda3/lib/python3.8/site-packages (from tweepy->-r requirements.txt (line 2)) (1.15.0) Requirement already satisfied: requests-oauthlib>=0.7.0 in /home/ali/.local/lib/python3.8/site-packages (from tweepy->-r requirements.txt (line 2)) (1.3.0) Requirement already satisfied: python-dateutil>=2.7.3 in /home/ali/anaconda3/lib/python3.8/site-packages (from pandas->-r requirements.txt (line 3)) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /home/ali/anaconda3/lib/python3.8/site-packages (from pandas->-r requirements.txt (line 3)) (2021.1) Requirement already satisfied: geographiclib<2,>=1.49 in /home/ali/.local/lib/python3.8/site-packages (from geopy->-r requirements.txt (line 4)) (1.50) Requirement already satisfied: dill in 
/home/ali/.local/lib/python3.8/site-packages (from pandarallel->-r requirements.txt (line 7)) (0.3.4) Requirement already satisfied: ipywidgets>=7.0.0cloudpickle>=0.2.2 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (7.6.3) Requirement already satisfied: dask[dataframe]>=2.10.0 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (2021.6.0) Requirement already satisfied: parso>0.4.0 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (0.8.2) Requirement already satisfied: modin[ray]>=0.8.1.1 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (0.10.0) Requirement already satisfied: psutil>=5.6.6 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (5.8.0) Requirement already satisfied: bleach>=3.1.1 in /home/ali/.local/lib/python3.8/site-packages (from swifter->-r requirements.txt (line 8)) (3.3.0) Requirement already satisfied: scipy>=1.0 in /home/ali/.local/lib/python3.8/site-packages (from seaborn->-r requirements.txt (line 9)) (1.6.3) Requirement already satisfied: pyproj>=2.2.0 in /home/ali/.local/lib/python3.8/site-packages (from geopandas->-r requirements.txt (line 11)) (3.1.0) Requirement already satisfied: fiona>=1.8 in /home/ali/.local/lib/python3.8/site-packages (from geopandas->-r requirements.txt (line 11)) (1.8.20) Requirement already satisfied: cartopy in /home/ali/.local/lib/python3.8/site-packages (from geoplot->-r requirements.txt (line 13)) (0.19.0.post1) Requirement already satisfied: descartes in /home/ali/.local/lib/python3.8/site-packages (from geoplot->-r requirements.txt (line 13)) (1.1.0) Requirement already satisfied: contextily>=1.0.0 in /home/ali/.local/lib/python3.8/site-packages (from geoplot->-r requirements.txt (line 13)) (1.1.0) Requirement already satisfied: mapclassify>=2.1 in /home/ali/.local/lib/python3.8/site-packages 
(from geoplot->-r requirements.txt (line 13)) (2.4.2) Requirement already satisfied: branca>=0.3.0 in /home/ali/.local/lib/python3.8/site-packages (from folium->-r requirements.txt (line 14)) (0.4.2) Requirement already satisfied: jinja2>=2.9 in /home/ali/.local/lib/python3.8/site-packages (from folium->-r requirements.txt (line 14)) (3.0.1) Requirement already satisfied: retrying>=1.3.3 in /home/ali/.local/lib/python3.8/site-packages (from plotly->-r requirements.txt (line 15)) (1.3.3) Requirement already satisfied: kiwisolver>=1.0.1 in /home/ali/.local/lib/python3.8/site-packages (from matplotlib->-r requirements.txt (line 16)) (1.3.1) Requirement already satisfied: pyparsing>=2.2.1 in /home/ali/.local/lib/python3.8/site-packages (from matplotlib->-r requirements.txt (line 16)) (2.4.7) Requirement already satisfied: cycler>=0.10 in /home/ali/.local/lib/python3.8/site-packages (from matplotlib->-r requirements.txt (line 16)) (0.10.0) Requirement already satisfied: pillow>=6.2.0 in /home/ali/anaconda3/lib/python3.8/site-packages (from matplotlib->-r requirements.txt (line 16)) (8.2.0) Requirement already satisfied: nltk>=3.1 in /home/ali/anaconda3/lib/python3.8/site-packages (from textblob->-r requirements.txt (line 20)) (3.6.1) Requirement already satisfied: lxml in /home/ali/anaconda3/lib/python3.8/site-packages (from covid_daily->-r requirements.txt (line 21)) (4.6.3) Requirement already satisfied: Unidecode in /home/ali/anaconda3/lib/python3.8/site-packages (from covid_daily->-r requirements.txt (line 21)) (1.2.0) Requirement already satisfied: packaging in /home/ali/.local/lib/python3.8/site-packages (from bleach>=3.1.1->swifter->-r requirements.txt (line 8)) (20.9) Requirement already satisfied: webencodings in /home/ali/.local/lib/python3.8/site-packages (from bleach>=3.1.1->swifter->-r requirements.txt (line 8)) (0.5.1) Requirement already satisfied: mercantile in /home/ali/.local/lib/python3.8/site-packages (from contextily>=1.0.0->geoplot->-r 
requirements.txt (line 13)) (1.2.1) Requirement already satisfied: rasterio in /home/ali/.local/lib/python3.8/site-packages (from contextily>=1.0.0->geoplot->-r requirements.txt (line 13)) (1.2.4) Requirement already satisfied: joblib in /home/ali/.local/lib/python3.8/site-packages (from contextily>=1.0.0->geoplot->-r requirements.txt (line 13)) (1.0.1) Requirement already satisfied: pyyaml in /home/ali/anaconda3/lib/python3.8/site-packages (from dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (5.4.1) Requirement already satisfied: partd>=0.3.10 in /home/ali/.local/lib/python3.8/site-packages (from dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (1.2.0) Requirement already satisfied: fsspec>=0.6.0 in /home/ali/.local/lib/python3.8/site-packages (from dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (2021.6.0) Requirement already satisfied: toolz>=0.8.2 in /home/ali/.local/lib/python3.8/site-packages (from dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (0.11.1) Requirement already satisfied: cloudpickle>=1.1.1 in /home/ali/.local/lib/python3.8/site-packages (from dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (1.6.0) Requirement already satisfied: attrs>=17 in /home/ali/.local/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (21.2.0) Requirement already satisfied: setuptools in /home/ali/anaconda3/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (52.0.0.post20210125) Requirement already satisfied: munch in /home/ali/.local/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (2.5.0) Requirement already satisfied: click>=4.0 in /home/ali/anaconda3/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (7.1.2) Requirement already satisfied: certifi in /home/ali/anaconda3/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r 
requirements.txt (line 11)) (2020.12.5) Requirement already satisfied: click-plugins>=1.0 in /home/ali/.local/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (1.1.1) Requirement already satisfied: cligj>=0.5 in /home/ali/.local/lib/python3.8/site-packages (from fiona>=1.8->geopandas->-r requirements.txt (line 11)) (0.7.2) Requirement already satisfied: traitlets>=4.3.1 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (5.0.5) Requirement already satisfied: ipython>=4.0.0 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (7.24.1) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.0.0) Requirement already satisfied: widgetsnbextension~=3.5.0 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (3.5.1) Requirement already satisfied: ipykernel>=4.5.1 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (5.5.5) Requirement already satisfied: nbformat>=4.2.0 in /home/ali/.local/lib/python3.8/site-packages (from ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (5.1.3) Requirement already satisfied: jupyter-client in /home/ali/.local/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (6.1.12) Requirement already satisfied: tornado>=4.2 in /home/ali/.local/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (6.1) Requirement already satisfied: pickleshare in /home/ali/.local/lib/python3.8/site-packages (from 
ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.7.5) Requirement already satisfied: decorator in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (4.4.2) Requirement already satisfied: jedi>=0.16 in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.18.0) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (3.0.18) Requirement already satisfied: backcall in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.2.0) Requirement already satisfied: pexpect>4.3 in /home/ali/anaconda3/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (4.8.0) Requirement already satisfied: matplotlib-inline in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.1.2) Requirement already satisfied: pygments in /home/ali/.local/lib/python3.8/site-packages (from ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (2.9.0) Requirement already satisfied: MarkupSafe>=2.0 in /home/ali/.local/lib/python3.8/site-packages (from jinja2>=2.9->folium->-r requirements.txt (line 14)) (2.0.1) Requirement already satisfied: scikit-learn in /home/ali/.local/lib/python3.8/site-packages (from mapclassify>=2.1->geoplot->-r requirements.txt (line 13)) (0.24.2) Requirement already satisfied: networkx in /home/ali/.local/lib/python3.8/site-packages (from mapclassify>=2.1->geoplot->-r requirements.txt (line 13)) (2.5.1) 
Requirement already satisfied: pyarrow>=1.0 in /home/ali/.local/lib/python3.8/site-packages (from modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (4.0.1) Requirement already satisfied: ray>=1.4.0 in /home/ali/.local/lib/python3.8/site-packages (from modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.4.0) Requirement already satisfied: jupyter-core in /home/ali/.local/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (4.7.1) Requirement already satisfied: ipython-genutils in /home/ali/.local/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.2.0) Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /home/ali/.local/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (3.2.0) Requirement already satisfied: pyrsistent>=0.14.0 in /home/ali/.local/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.17.3) Requirement already satisfied: regex in /home/ali/anaconda3/lib/python3.8/site-packages (from nltk>=3.1->textblob->-r requirements.txt (line 20)) (2021.4.4) Requirement already satisfied: locket in /home/ali/.local/lib/python3.8/site-packages (from partd>=0.3.10->dask[dataframe]>=2.10.0->swifter->-r requirements.txt (line 8)) (0.2.1) Requirement already satisfied: ptyprocess>=0.5 in /home/ali/.local/lib/python3.8/site-packages (from pexpect>4.3->ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.7.0) Requirement already satisfied: wcwidth in /home/ali/.local/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.2.5) Requirement already satisfied: 
aiohttp in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.7.4.post0) Requirement already satisfied: pydantic>=1.8 in /home/ali/anaconda3/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.8.2) Requirement already satisfied: filelock in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.0.12) Requirement already satisfied: protobuf>=3.15.3 in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.17.3) Requirement already satisfied: py-spy>=0.2.0 in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.3.7) Requirement already satisfied: prometheus-client>=0.7.1 in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.11.0) Requirement already satisfied: redis>=3.5.0 in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.5.3) Requirement already satisfied: gpustat in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.6.0) Requirement already satisfied: colorama in /home/ali/anaconda3/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.4.4) Requirement already satisfied: aioredis in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.3.1) Requirement already satisfied: aiohttp-cors in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.7.0) Requirement already satisfied: grpcio>=1.28.1 in 
/home/ali/anaconda3/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.34.1) Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.0.2) Requirement already satisfied: opencensus in /home/ali/.local/lib/python3.8/site-packages (from ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.7.13) Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ali/anaconda3/lib/python3.8/site-packages (from pydantic>=1.8->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.7.4.3) Requirement already satisfied: oauthlib>=3.0.0 in /home/ali/anaconda3/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->tweepy->-r requirements.txt (line 2)) (3.1.1) Requirement already satisfied: idna<3,>=2.5 in /home/ali/anaconda3/lib/python3.8/site-packages (from requests[socks]>=2.11.1->tweepy->-r requirements.txt (line 2)) (2.10) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ali/anaconda3/lib/python3.8/site-packages (from requests[socks]>=2.11.1->tweepy->-r requirements.txt (line 2)) (1.26.4) Requirement already satisfied: chardet<5,>=3.0.2 in /home/ali/anaconda3/lib/python3.8/site-packages (from requests[socks]>=2.11.1->tweepy->-r requirements.txt (line 2)) (4.0.0) Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/ali/.local/lib/python3.8/site-packages (from requests[socks]>=2.11.1->tweepy->-r requirements.txt (line 2)) (1.7.1) Requirement already satisfied: notebook>=4.4.1 in /home/ali/.local/lib/python3.8/site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (6.4.0) Requirement already satisfied: argon2-cffi in /home/ali/.local/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r 
requirements.txt (line 8)) (20.1.0) Requirement already satisfied: pyzmq>=17 in /home/ali/.local/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (22.1.0) Requirement already satisfied: terminado>=0.8.3 in /home/ali/.local/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.10.1) Requirement already satisfied: nbconvert in /home/ali/.local/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (6.0.7) Requirement already satisfied: Send2Trash>=1.5.0 in /home/ali/.local/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.5.0) Requirement already satisfied: async-timeout<4.0,>=3.0 in /home/ali/.local/lib/python3.8/site-packages (from aiohttp->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (3.0.1) Requirement already satisfied: multidict<7.0,>=4.5 in /home/ali/.local/lib/python3.8/site-packages (from aiohttp->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (5.1.0) Requirement already satisfied: yarl<2.0,>=1.0 in /home/ali/.local/lib/python3.8/site-packages (from aiohttp->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.6.3) Requirement already satisfied: hiredis in /home/ali/.local/lib/python3.8/site-packages (from aioredis->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (2.0.0) Requirement already satisfied: cffi>=1.0.0 in /home/ali/.local/lib/python3.8/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.14.5) Requirement already satisfied: pycparser in 
/home/ali/.local/lib/python3.8/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (2.20) Requirement already satisfied: pyshp>=2 in /home/ali/.local/lib/python3.8/site-packages (from cartopy->geoplot->-r requirements.txt (line 13)) (2.1.3) Requirement already satisfied: blessings>=1.6 in /home/ali/.local/lib/python3.8/site-packages (from gpustat->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.7) Requirement already satisfied: nvidia-ml-py3>=7.352.0 in /home/ali/.local/lib/python3.8/site-packages (from gpustat->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (7.352.0) Requirement already satisfied: defusedxml in /home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.7.1) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.5.3) Requirement already satisfied: entrypoints>=0.2.2 in /home/ali/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.3) Requirement already satisfied: jupyterlab-pygments in /home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.1.2) Requirement already satisfied: pandocfilters>=1.4.1 in /home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.4.3) Requirement already satisfied: testpath in 
/home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.5.0) Requirement already satisfied: mistune<2,>=0.8.1 in /home/ali/.local/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (0.8.4) Requirement already satisfied: async-generator in /home/ali/.local/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.10) Requirement already satisfied: nest-asyncio in /home/ali/.local/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0cloudpickle>=0.2.2->swifter->-r requirements.txt (line 8)) (1.5.1) Requirement already satisfied: google-api-core<2.0.0,>=1.0.0 in /home/ali/.local/lib/python3.8/site-packages (from opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.30.0) Requirement already satisfied: opencensus-context==0.1.2 in /home/ali/.local/lib/python3.8/site-packages (from opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.1.2) Requirement already satisfied: google-auth<2.0dev,>=1.25.0 in /home/ali/.local/lib/python3.8/site-packages (from google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.31.0) Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /home/ali/.local/lib/python3.8/site-packages (from google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (1.53.0) Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/ali/.local/lib/python3.8/site-packages (from 
google-auth<2.0dev,>=1.25.0->google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.2.8) Requirement already satisfied: rsa<5,>=3.1.4 in /home/ali/.local/lib/python3.8/site-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (4.7.2) Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/ali/.local/lib/python3.8/site-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (4.2.2) Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /home/ali/.local/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->google-api-core<2.0.0,>=1.0.0->opencensus->ray>=1.4.0->modin[ray]>=0.8.1.1->swifter->-r requirements.txt (line 8)) (0.4.8) Requirement already satisfied: snuggs>=1.4.1 in /home/ali/.local/lib/python3.8/site-packages (from rasterio->contextily>=1.0.0->geoplot->-r requirements.txt (line 13)) (1.4.7) Requirement already satisfied: affine in /home/ali/.local/lib/python3.8/site-packages (from rasterio->contextily>=1.0.0->geoplot->-r requirements.txt (line 13)) (2.3.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /home/ali/.local/lib/python3.8/site-packages (from scikit-learn->mapclassify>=2.1->geoplot->-r requirements.txt (line 13)) (2.1.0)
# --- Core data handling ---
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
from collections import Counter
from datetime import date
import os
import datetime
from sklearn import preprocessing
import json
# --- Plotting / geo-visualization ---
import seaborn as sns
import geopandas as gpd
from nltk.corpus import stopwords
import geoplot
from geopy import Nominatim
import folium
from similarity.cosine import Cosine
import mapclassify
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objs as go
# --- Text feature extraction / clustering ---
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from mpl_toolkits.axes_grid1 import make_axes_locatable
from folium.plugins import HeatMapWithTime, TimestampedGeoJson
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# NOTE(review): TfidfVectorizer is already imported above; duplicate kept as-is.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from iso3166 import countries
import calmap
# NOTE(review): Nominatim was already imported via `from geopy import Nominatim`;
# this is the same class re-imported from its canonical module.
from geopy.geocoders import Nominatim
from tqdm import tqdm
import country_converter as coco
import time
import pymongo
# --- Sentiment analysis ---
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import matplotlib.ticker as ticker
# For Parallel Processing
import swifter
from pandarallel import pandarallel
# Spawns one worker per CPU core for DataFrame .parallel_apply() calls.
pandarallel.initialize()
import matplotlib.style as style
# Use the FiveThirtyEight look for all matplotlib figures in this notebook.
style.use('fivethirtyeight')
INFO: Pandarallel will run on 8 workers. INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
# Seed NumPy deterministically (sum of char codes of 'calmap') so the
# calmap/plot examples are reproducible across runs.
np.random.seed(sum(map(ord, 'calmap')))
# Connect to the local MongoDB instance (default host/port) and select
# the "twitter" database / "tweets" collection used by this project.
mongo_client = pymongo.MongoClient()
mongo_db = mongo_client["twitter"]
mongo_collection = mongo_db["tweets"]
# Load world cities dataset
cities = pd.read_csv('worldcities.csv')
print(cities.shape)
# Load tweets dataset from MongoDB
# full_dataframe = pd.DataFrame(mongo_collection.find()[:50000])
# Load old tweets dataset from csv file
# full_dataframe = pd.read_csv("All_Tweets.csv")
full_dataframe = pd.read_csv("covid19_tweets.csv")
# full_dataframe = full_dataframe[["Text"]]
# full_dataframe.columns = ["text"]
full_dataframe.shape
(15493, 11)
(179108, 13)
full_dataframe.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:27:21 | If I smelled the scent of hand sanitizers toda... | NaN | Twitter for iPhone | False |
| 1 | Tom Basile 🇺🇸 | New York, NY | Husband, Father, Columnist & Commentator. Auth... | 2009-04-16 20:06:23 | 2253 | 1677 | 24 | True | 2020-07-25 12:27:17 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | NaN | Twitter for Android | False |
| 2 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9275 | 9525 | 7254 | False | 2020-07-25 12:27:14 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | ['COVID19'] | Twitter for Android | False |
| 3 | ethel mertz | Stuck in the Middle | #Browns #Indians #ClevelandProud #[]_[] #Cavs ... | 2019-03-07 01:45:06 | 197 | 987 | 1488 | False | 2020-07-25 12:27:10 | @brookbanktv The one gift #COVID19 has give me... | ['COVID19'] | Twitter for iPhone | False |
| 4 | DIPR-J&K | Jammu and Kashmir | 🖊️Official Twitter handle of Department of Inf... | 2017-02-12 06:45:15 | 101009 | 168 | 101 | False | 2020-07-25 12:27:08 | 25 July : Media Bulletin on Novel #CoronaVirus... | ['CoronaVirusUpdates', 'COVID19'] | Twitter for Android | False |
def filter_tweet(tweet):
    """Flatten a raw Twitter API status dict into a single-level record.

    Parameters
    ----------
    tweet : dict
        A status object as returned by the Twitter API (expects the
        extended ``full_text`` field and nested ``user`` / ``entities``
        / ``coordinates`` structures).

    Returns
    -------
    dict
        Flat dict with user metadata, tweet text/date/id and, when the
        tweet is geo-tagged, its latitude/longitude.
    """
    new_tweet = {}
    new_tweet['user_name'] = tweet['user']['screen_name']
    new_tweet['user_location'] = tweet['user']['location']
    new_tweet['hashtags'] = tweet['entities']['hashtags']
    new_tweet['is_retweet'] = tweet['retweeted']
    new_tweet['source'] = tweet['source']
    new_tweet['user_description'] = tweet['user']['description']
    new_tweet['user_created'] = tweet['user']['created_at']
    new_tweet['user_followers'] = tweet['user']['followers_count']
    new_tweet['user_friends'] = tweet['user']['friends_count']
    new_tweet['user_favourites'] = tweet['user']['favourites_count']
    new_tweet['user_verified'] = tweet['user']['verified']
    new_tweet['date'] = tweet['created_at']
    new_tweet['text'] = tweet['full_text']
    new_tweet['tweet_id'] = tweet['id']
    if tweet['coordinates']:
        # BUG FIX: the Twitter 'coordinates' field is GeoJSON, which is
        # ordered [longitude, latitude]; the original stored them swapped.
        new_tweet['lat'] = tweet['coordinates']['coordinates'][1]
        new_tweet['long'] = tweet['coordinates']['coordinates'][0]
    else:
        # BUG FIX: use a real None instead of the string 'None' so pandas
        # recognises missing coordinates as missing values.
        new_tweet['lat'] = None
        new_tweet['long'] = None
    new_tweet['geo_enabled'] = tweet['user']['geo_enabled']
    new_tweet['tweet_place'] = tweet['place']
    return new_tweet
# MongoDB path: flatten raw statuses then de-duplicate on tweet_id.
# rows = [filter_tweet(row) for index, row in full_dataframe.iterrows()]
# full_dataframe = pd.DataFrame(rows)
# df = full_dataframe.drop_duplicates(subset='tweet_id', keep="last")

# The csv dump carries no tweet_id, so de-duplicate on the tweet text,
# keeping the most recent occurrence of each.
df = full_dataframe.drop_duplicates(subset='text', keep="last")
del full_dataframe  # release the raw frame; only the deduped copy is needed
df = df.reset_index()
print(df.shape)
df.head()
(178683, 14)
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:27:21 | If I smelled the scent of hand sanitizers toda... | NaN | Twitter for iPhone | False |
| 1 | 1 | Tom Basile 🇺🇸 | New York, NY | Husband, Father, Columnist & Commentator. Auth... | 2009-04-16 20:06:23 | 2253 | 1677 | 24 | True | 2020-07-25 12:27:17 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | NaN | Twitter for Android | False |
| 2 | 2 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9275 | 9525 | 7254 | False | 2020-07-25 12:27:14 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | ['COVID19'] | Twitter for Android | False |
| 3 | 3 | ethel mertz | Stuck in the Middle | #Browns #Indians #ClevelandProud #[]_[] #Cavs ... | 2019-03-07 01:45:06 | 197 | 987 | 1488 | False | 2020-07-25 12:27:10 | @brookbanktv The one gift #COVID19 has give me... | ['COVID19'] | Twitter for iPhone | False |
| 4 | 4 | DIPR-J&K | Jammu and Kashmir | 🖊️Official Twitter handle of Department of Inf... | 2017-02-12 06:45:15 | 101009 | 168 | 101 | False | 2020-07-25 12:27:08 | 25 July : Media Bulletin on Novel #CoronaVirus... | ['CoronaVirusUpdates', 'COVID19'] | Twitter for Android | False |
# Report the dataset dimensions after de-duplication.
row_num, col_num = df.shape
print(f'There are {row_num} rows and {col_num} columns')
There are 178683 rows and 14 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178683 entries, 0 to 178682 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 178683 non-null int64 1 user_name 178683 non-null object 2 user_location 142126 non-null object 3 user_description 168412 non-null object 4 user_created 178683 non-null object 5 user_followers 178683 non-null int64 6 user_friends 178683 non-null int64 7 user_favourites 178683 non-null int64 8 user_verified 178683 non-null bool 9 date 178683 non-null object 10 text 178683 non-null object 11 hashtags 127349 non-null object 12 source 178606 non-null object 13 is_retweet 178683 non-null bool dtypes: bool(2), int64(4), object(8) memory usage: 16.7+ MB
df.describe()
| index | user_followers | user_friends | user_favourites | |
|---|---|---|---|---|
| count | 178683.000000 | 1.786830e+05 | 178683.000000 | 1.786830e+05 |
| mean | 89589.983395 | 1.092361e+05 | 2124.155756 | 1.446586e+04 |
| std | 51700.752531 | 8.422506e+05 | 9171.461235 | 4.456473e+04 |
| min | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 |
| 25% | 44829.500000 | 1.740000e+02 | 149.000000 | 2.080000e+02 |
| 50% | 89579.000000 | 9.960000e+02 | 543.000000 | 1.800000e+03 |
| 75% | 134371.500000 | 5.295000e+03 | 1726.000000 | 9.412000e+03 |
| max | 179107.000000 | 4.944256e+07 | 497363.000000 | 2.047197e+06 |
Removing Missing Values
df["country"] = np.NaN
user_location = df['user_location'].fillna(value='').str.split(',')
user_location.head()
0 [astroworld] 1 [New York, NY] 2 [Pewee Valley, KY] 3 [Stuck in the Middle ] 4 [Jammu and Kashmir] Name: user_location, dtype: object
# Row-aligned coordinate and country lists from the cities reference table.
lat = cities['lat'].fillna(value='').values.tolist()
lng = cities['lng'].fillna(value='').values.tolist()
country = cities['country'].fillna(value='').values.tolist()

# Unique alpha-3 country codes, first-seen order preserved.
# dict.fromkeys de-duplicates in O(n) instead of the original
# O(n^2) `if c not in list: list.append(c)` loops.
world_city_iso3 = list(dict.fromkeys(cities['iso3'].str.lower().str.strip().values.tolist()))
# Unique alpha-2 country codes, first-seen order preserved.
world_city_iso2 = list(dict.fromkeys(cities['iso2'].str.lower().str.strip().values.tolist()))
# Unique country names, first-seen order preserved.
world_city_country = list(dict.fromkeys(cities['country'].str.lower().str.strip().values.tolist()))
# Admin (state/province) names: deliberately NOT de-duplicated, so that the
# index of a match stays row-aligned with the `country` list above.
world_states = cities['admin_name'].str.lower().str.strip().tolist()
# City names: also row-aligned with `country`, so no de-duplication either.
world_city = cities['city'].fillna(value='').str.lower().str.strip().values.tolist()
world_city[:10]
['tokyo', 'new york', 'mexico city', 'mumbai', 'são paulo', 'delhi', 'shanghai', 'kolkata', 'los angeles', 'dhaka']
# Map each free-text location to a country by trying, in priority order:
# city name, admin/state name, country name, ISO-2 code, ISO-3 code.
#
# First-occurrence index maps are built once up front: the original called
# list.index() (O(n)) per token over ~40k-entry lists for ~179k rows,
# which is quadratic in practice. setdefault keeps list.index's
# "first occurrence wins" semantics.
_city_idx = {}
for _i, _name in enumerate(world_city):
    _city_idx.setdefault(_name, _i)
_state_idx = {}
for _i, _name in enumerate(world_states):
    _state_idx.setdefault(_name, _i)
_country_idx = {_name: _i for _i, _name in enumerate(world_city_country)}
_iso2_idx = {_code: _i for _i, _code in enumerate(world_city_iso2)}
_iso3_idx = {_code: _i for _i, _code in enumerate(world_city_iso3)}

for ind, tokens in enumerate(user_location):
    for each in tokens:
        each = each.lower().strip()
        if each in _city_idx:
            # .loc fixes the chained assignment (`df['country'][ind] = ...`)
            # that raised SettingWithCopyWarning and can silently no-op.
            df.loc[ind, 'country'] = country[_city_idx[each]].lower()
            continue
        if each in _state_idx:
            df.loc[ind, 'country'] = country[_state_idx[each]].lower()
            continue
        if each in _country_idx:
            df.loc[ind, 'country'] = world_city_country[_country_idx[each]].lower()
            continue
        if each in _iso2_idx:
            # NOTE(review): indexing world_city_country with the ISO-2
            # position assumes the two de-duplicated lists stayed
            # row-aligned — fragile, kept from the original logic.
            df.loc[ind, 'country'] = world_city_country[_iso2_idx[each]].lower()
            continue
        if each in _iso3_idx:
            df.loc[ind, 'country'] = world_city_country[_iso3_idx[each]].lower()
            continue
<ipython-input-13-4ecfff0ab7fb>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/ali/.local/lib/python3.8/site-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-13-4ecfff0ab7fb>:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-13-4ecfff0ab7fb>:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-13-4ecfff0ab7fb>:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-13-4ecfff0ab7fb>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Confirm the new 'country' column was appended (15 columns now).
print(df.shape)
df.head()
(178683, 15)
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:27:21 | If I smelled the scent of hand sanitizers toda... | NaN | Twitter for iPhone | False | NaN |
| 1 | 1 | Tom Basile 🇺🇸 | New York, NY | Husband, Father, Columnist & Commentator. Auth... | 2009-04-16 20:06:23 | 2253 | 1677 | 24 | True | 2020-07-25 12:27:17 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | NaN | Twitter for Android | False | united states |
| 2 | 2 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9275 | 9525 | 7254 | False | 2020-07-25 12:27:14 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | ['COVID19'] | Twitter for Android | False | cayman islands |
| 3 | 3 | ethel mertz | Stuck in the Middle | #Browns #Indians #ClevelandProud #[]_[] #Cavs ... | 2019-03-07 01:45:06 | 197 | 987 | 1488 | False | 2020-07-25 12:27:10 | @brookbanktv The one gift #COVID19 has give me... | ['COVID19'] | Twitter for iPhone | False | NaN |
| 4 | 4 | DIPR-J&K | Jammu and Kashmir | 🖊️Official Twitter handle of Department of Inf... | 2017-02-12 06:45:15 | 101009 | 168 | 101 | False | 2020-07-25 12:27:08 | 25 July : Media Bulletin on Novel #CoronaVirus... | ['CoronaVirusUpdates', 'COVID19'] | Twitter for Android | False | NaN |
import datetime
print("Current date:", datetime.date.today())
# Save a dated snapshot of the country-enriched dataframe.
df.to_csv("covid_tweets_{}.csv".format(datetime.date.today()), index=False)
Current date: 2021-08-07
# FIX: the previous label called these "valid Tweets Available", but
# isnull().sum() counts rows where NO country could be resolved.
print('Tweets without a resolved country: ', df['country'].isnull().sum())
Total Number of valid Tweets Available: 77628
# Tweets per resolved country (case-normalised, unresolved rows dropped).
country_counts = df['country'].str.lower().dropna().value_counts()
tw = country_counts.rename_axis('Country').reset_index(name='Tweet Count')
print(tw)
plt.rcParams['figure.figsize'] = (15, 10)
plt.title('Top 10 Countries with Most Tweets', fontsize=15)
sns.set_palette("husl")
ax = sns.barplot(y=tw['Country'].head(10), x=tw['Tweet Count'].head(10))
Country Tweet Count 0 united states 28848 1 india 18372 2 united kingdom 9229 3 canada 7562 4 australia 4110 .. ... ... 188 sao tome and principe 1 189 tonga 1 190 eritrea 1 191 armenia 1 192 lithuania 1 [193 rows x 2 columns]
# Same per-country counts, this time plotting the rarest countries.
country_counts = df['country'].str.lower().dropna().value_counts()
tw = country_counts.rename_axis('Country').reset_index(name='Tweet Count')
print(tw)
plt.rcParams['figure.figsize'] = (15, 10)
plt.title('10 Countries with Least Tweets', fontsize=15)
sns.set_palette("husl")
ax = sns.barplot(y=tw['Country'][-9:], x=tw['Tweet Count'][-9:])
Country Tweet Count 0 united states 28848 1 india 18372 2 united kingdom 9229 3 canada 7562 4 australia 4110 .. ... ... 188 sao tome and principe 1 189 tonga 1 190 eritrea 1 191 armenia 1 192 lithuania 1 [193 rows x 2 columns]
print(df["date"].min())
print(df["date"].max())
2020-07-24 23:47:08 2020-08-30 09:07:39
country_graph_03 = px.bar(x='Tweet Count',y='Country',data_frame=tw[:15],color='Country')
country_graph_03.show()
Let's see the percentage of NaNs for every column. We will visualize only columns with at least 1 missing value.
# Percentage of NaNs per column; chart only columns that have any.
missed = pd.DataFrame()
missed['column'] = df.columns
missed['percent'] = [round(100 * df[c].isnull().sum() / len(df), 2) for c in df.columns]
missed = missed.sort_values('percent')
missed = missed[missed['percent'] > 0]
fig = px.bar(
    missed,
    x='percent',
    y="column",
    orientation='h',
    title='Missed values percent for every column (percent > 0)',
    height=400,
    width=600,
)
fig.show()
Let's see top 40 users by number of tweets.
# Tweets per user, ascending, so .tail(40) yields the 40 most active.
tweet_counts = df['user_name'].value_counts().reset_index()
tweet_counts.columns = ['user_name', 'tweets_count']
tweet_counts = tweet_counts.sort_values(['tweets_count'])
fig = px.bar(
    tweet_counts.tail(40),
    x="tweets_count",
    y="user_name",
    orientation='h',
    title='Top 40 users by number of tweets',
    width=800,
    height=800,
)
fig.show()
# Attach each user's total tweet count back onto the main frame.
df = pd.merge(df, tweet_counts, on='user_name')
Let's see most popular users.
# One row per user (their highest-followers snapshot), ascending so the
# tail is the top 40 by followers.
top_followed = df.sort_values('user_followers', ascending=False)
top_followed = top_followed.drop_duplicates(subset='user_name', keep="first")
top_followed = top_followed[['user_name', 'user_followers', 'tweets_count']]
top_followed = top_followed.sort_values('user_followers')
fig = px.bar(
    top_followed.tail(40),
    x="user_followers",
    y="user_name",
    color='tweets_count',
    orientation='h',
    title='Top 40 users by number of followers',
    width=800,
    height=800,
)
fig.show()
And most friendly users.
# One row per user (their highest-friends snapshot), top 40 by friend count.
friendliest = df.sort_values('user_friends', ascending=False)
friendliest = friendliest.drop_duplicates(subset='user_name', keep="first")
friendliest = friendliest[['user_name', 'user_friends', 'tweets_count']]
friendliest = friendliest.sort_values('user_friends')
fig = px.bar(
    friendliest.tail(40),
    x="user_friends",
    y="user_name",
    color='tweets_count',
    orientation='h',
    title='Top 40 users by number of friends',
    width=800,
    height=800,
)
fig.show()
Let's see how coronavirus affected the creation of new users.
# Account-creation year per unique user (years <= 1970 are epoch noise).
df['user_created'] = pd.to_datetime(df['user_created'])
df['year_created'] = df['user_created'].dt.year
unique_users = df.drop_duplicates(subset='user_name', keep="first")
unique_users = unique_users[unique_users['year_created'] > 1970]
per_year = unique_users['year_created'].value_counts().reset_index()
per_year.columns = ['year', 'number']
fig = px.bar(
    per_year,
    x="year",
    y="number",
    orientation='v',
    title='User created year by year',
    width=800,
    height=600,
)
fig.show()
As we can see from chart coronavirus increases the number of new twitter users.
df.head()
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | country | tweets_count | year_created | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:27:21 | If I smelled the scent of hand sanitizers toda... | NaN | Twitter for iPhone | False | NaN | 2 | 2017 |
| 1 | 61 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:25:24 | I miss isopropyl alcohol so much!!!! Ethanol i... | ['COVID19'] | Twitter for iPhone | False | NaN | 2 | 2017 |
| 2 | 1 | Tom Basile 🇺🇸 | New York, NY | Husband, Father, Columnist & Commentator. Auth... | 2009-04-16 20:06:23 | 2253 | 1677 | 24 | True | 2020-07-25 12:27:17 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | NaN | Twitter for Android | False | united states | 1 | 2009 |
| 3 | 2 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9275 | 9525 | 7254 | False | 2020-07-25 12:27:14 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | ['COVID19'] | Twitter for Android | False | cayman islands | 7 | 2009 |
| 4 | 11446 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9276 | 9526 | 7256 | False | 2020-07-25 04:32:59 | Seen enough "errors" to know the #COVID19 flar... | ['COVID19'] | Twitter for Android | False | cayman islands | 7 | 2009 |
Let's see top 40 most popular locations by the number of tweets.
# Tweet counts per raw user_location string (the 'NA' placeholder excluded).
loc_counts = df['user_location'].value_counts().reset_index()
loc_counts.columns = ['user_location', 'count']
loc_counts = loc_counts[loc_counts['user_location'] != 'NA']
loc_counts = loc_counts.sort_values(['count'])
fig = px.bar(
    loc_counts.tail(40),
    x="count",
    y="user_location",
    orientation='h', title='Top 40 user locations by number of tweets',
    width=800,
    height=800,
)
fig.show()
We can also see the pie plot for the full picture of users' locations.
def pie_count(data, field, percent_limit, title):
    """Draw a pie chart of value counts for `field`, folding slices below
    `percent_limit` percent into a single "Others" slice.

    FIX: no longer mutates the caller's DataFrame — the original wrote
    `data[field].fillna('NA')` back into `data`. Also removes the no-op
    `percent_limit = percent_limit` line.
    """
    counts = data[field].fillna('NA').value_counts().to_frame()
    total = counts[field].sum()
    counts['percentage'] = 100 * counts[field] / total
    otherdata = counts[counts['percentage'] < percent_limit]
    others = otherdata['percentage'].sum()
    maindata = counts[counts['percentage'] >= percent_limit].copy()
    other_label = "Others(<" + str(percent_limit) + "% each)"
    # Fold all small slices into one row; only the count feeds the pie trace,
    # so its 'percentage' cell is deliberately left unset.
    maindata.loc[other_label] = pd.Series({field: otherdata[field].sum()})
    labels = maindata.index.tolist()
    datavals = maindata[field].tolist()
    trace = go.Pie(labels=labels, values=datavals)
    layout = go.Layout(
        title=title,
        height=600,
        width=600,
    )
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
pie_count(df, 'user_location', 0.5, 'Number of tweets per location')
Now it's time to check the last categorical feature - source. Let's see the top 40 sources by the number of tweets.
# Tweet counts per client application ("source").
source_counts = df['source'].value_counts().reset_index()
source_counts.columns = ['source', 'count']
source_counts = source_counts.sort_values(['count'])
fig = px.bar(
    source_counts.tail(40),
    x="count",
    y="source",
    orientation='h',
    title='Top 40 user sources by number of tweets',
    width=800,
    height=800,
)
fig.show()
Let's create a new feature - hashtags_count - that shows how many hashtags each tweet contains.
import ast

# BUG FIX: the hashtags column holds *stringified* Python lists, so the
# original `len(x)` counted characters, not hashtags ("['COVID19']" -> 11).
# Parse the string and count the actual list elements instead.
df['hashtags'] = df['hashtags'].fillna('[]')

def _count_hashtags(raw):
    """Return the number of hashtags in a stringified Python list; 0 on
    anything unparseable."""
    try:
        parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    except (ValueError, SyntaxError):
        return 0
    return len(parsed) if isinstance(parsed, (list, tuple)) else 0

df['hashtags_count'] = df['hashtags'].apply(_count_hashtags)
df[['hashtags','hashtags_count']].head()
| hashtags | hashtags_count | |
|---|---|---|
| 0 | [] | 0 |
| 1 | ['COVID19'] | 11 |
| 2 | [] | 0 |
| 3 | ['COVID19'] | 11 |
| 4 | ['COVID19'] | 11 |
And see the values for the newly created column.
df['hashtags_count'].describe()
count 178683.000000 mean 18.592849 std 20.652898 min 0.000000 25% 0.000000 50% 11.000000 75% 26.000000 max 146.000000 Name: hashtags_count, dtype: float64
# Relate per-user tweet volume to the hashtag count of individual tweets.
fig = px.scatter(
    df,
    x=df['hashtags_count'],
    y=df['tweets_count'],
    height=700,
    width=700,
    title='Total number of tweets for users and number of hashtags in every tweet'
)
fig.show()
The distribution of the new feature over the number of tweets is as expected - many tweets with few hashtags and few tweets with a huge number of hashtags.
# How many tweets carry k hashtags, for each observed k.
tag_dist = df['hashtags_count'].value_counts().reset_index()
tag_dist.columns = ['hashtags_count', 'count']
tag_dist = tag_dist.sort_values(['count'])
tag_dist['hashtags_count'] = tag_dist['hashtags_count'].astype(str) + ' tags'
fig = px.bar(
    tag_dist,
    x="count",
    y="hashtags_count",
    orientation='h',
    title='Distribution of number of hashtags in tweets',
    width=800,
    height=600,
)
fig.show()
Now we will see top 40 users that like to use hashtags a little bit more than others.
# Mean hashtags per tweet, restricted to prolific users (>10 tweets).
prolific = df[df['tweets_count'] > 10]
prolific = prolific.groupby(['user_name', 'tweets_count'])['hashtags_count'].mean().reset_index()
prolific.columns = ['user', 'tweets_count', 'mean_count']
prolific = prolific.sort_values(['mean_count'])
fig = px.bar(
    prolific.tail(40),
    x="mean_count",
    y="user",
    color='tweets_count',
    orientation='h',
    title='Top 40 users with higher mean number of hashtags (at least 10 tweets per user)',
    width=800,
    height=800,
)
fig.show()
# Parse timestamps, order chronologically, then split "YYYY-MM-DD HH:MM:SS"
# into separate day and time columns.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['date'])
date_parts = df['date'].astype(str).str.split(' ', expand=True)
df['day'] = date_parts[0]
df['time'] = date_parts[1]
df.head()
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | country | tweets_count | year_created | hashtags_count | day | time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54713 | 17175 | Jessica Luther Rummel | Denton, Texas | Mother. Partner. Scholar. Activist. Alchemist.... | 2020-07-05 03:28:16 | 234 | 28 | 220 | False | 2020-07-24 23:47:08 | PROTESTORS NEEDED 6PM-10PM DAILY (NOW). 1450 E... | [] | Twitter Web App | False | united states | 1 | 2020 | 0 | 2020-07-24 | 23:47:08 |
| 54710 | 17172 | Clive Gorman | Victoria, British Columbia | Marketing Director @CNEGames for @idlechampion... | 2011-12-23 02:02:35 | 597 | 224 | 23824 | False | 2020-07-24 23:47:12 | Don't take your eye off the ball. There are st... | ['COVID19'] | Twitter for Android | False | canada | 2 | 2011 | 11 | 2020-07-24 | 23:47:12 |
| 50431 | 17173 | Patty Hayes | Seattle, WA | Director of Public Health - Seattle & King Cou... | 2017-07-07 18:56:50 | 718 | 162 | 2276 | False | 2020-07-24 23:47:12 | Excellent partnership to prevent #COVID19, tha... | ['COVID19'] | Twitter for Android | False | ghana | 2 | 2017 | 11 | 2020-07-24 | 23:47:12 |
| 54712 | 17174 | Dr. Lipi #TrustDrFauci Roy | New York, USA | @NBCNews @MSNBC @Forbes Medical Contributor | ... | 2009-10-11 18:46:51 | 12485 | 4603 | 38120 | True | 2020-07-24 23:47:12 | Always honored to speak w @NicolleDWallace abo... | ['covid19'] | Twitter Web App | False | united states | 1 | 2009 | 11 | 2020-07-24 | 23:47:12 |
| 54709 | 17171 | Tristyn Russelo | Alberta, Canada | NaN | 2017-06-14 22:01:54 | 5 | 68 | 57 | False | 2020-07-24 23:47:13 | Apparently, at the Timberlea @saveonfoods stor... | [] | Twitter Web App | False | canada | 1 | 2017 | 0 | 2020-07-24 | 23:47:13 |
# Count distinct users active on each day (the hashtags_count column is
# only used as a per-group row counter here).
daily = df.groupby(['day', 'user_name'])['hashtags_count'].count().reset_index()
daily = daily.groupby(['day'])['user_name'].count().reset_index()
daily.columns = ['day', 'number_of_users']
daily['day'] = daily['day'].astype(str) + ':00:00:00'
fig = px.bar(
    daily,
    x='day',
    y="number_of_users",
    orientation='v',
    title='Number of unique users per day',
    width=800,
    height=800,
)
fig.show()
# Total tweets per calendar day.
per_day = df['day'].value_counts().reset_index()
per_day.columns = ['day', 'count']
per_day = per_day.sort_values('count')
per_day['day'] = per_day['day'].astype(str) + ':00:00:00'
fig = px.bar(
    per_day,
    x='count',
    y="day",
    orientation='h',
    title='Tweets distribution over days present in dataset',
    width=800,
    height=800,
)
fig.show()
# Total tweets per hour of the day.
df['hour'] = df['date'].dt.hour
hourly = df['hour'].value_counts().reset_index()
hourly.columns = ['hour', 'count']
hourly['hour'] = 'Hour ' + hourly['hour'].astype(str)
fig = px.bar(
    hourly,
    x="hour",
    y="count",
    orientation='v',
    title='Tweets distribution over hours',
    width=800,
)
fig.show()
def split_hashtags(x):
    """Strip the square brackets from a stringified list and split on commas."""
    bracketless = str(x).translate(str.maketrans('', '', '[]'))
    return bracketless.split(',')
# Build a one-row-per-hashtag frame: split the stringified lists, explode,
# then normalise each tag (lower-case, quotes and spaces removed).
tweets_df = df.copy()
tweets_df['hashtag'] = tweets_df['hashtags'].apply(split_hashtags)
tweets_df = tweets_df.explode('hashtag')
tweets_df['hashtag'] = (
    tweets_df['hashtag'].astype(str).str.lower()
    .str.replace("'", '').str.replace(" ", '')
)
tweets_df.loc[tweets_df['hashtag'] == '', 'hashtag'] = 'NO HASHTAG'
tweets_df.head()
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | ... | source | is_retweet | country | tweets_count | year_created | hashtags_count | day | time | hour | hashtag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54713 | 17175 | Jessica Luther Rummel | Denton, Texas | Mother. Partner. Scholar. Activist. Alchemist.... | 2020-07-05 03:28:16 | 234 | 28 | 220 | False | 2020-07-24 23:47:08 | ... | Twitter Web App | False | united states | 1 | 2020 | 0 | 2020-07-24 | 23:47:08 | 23 | NO HASHTAG |
| 54710 | 17172 | Clive Gorman | Victoria, British Columbia | Marketing Director @CNEGames for @idlechampion... | 2011-12-23 02:02:35 | 597 | 224 | 23824 | False | 2020-07-24 23:47:12 | ... | Twitter for Android | False | canada | 2 | 2011 | 11 | 2020-07-24 | 23:47:12 | 23 | covid19 |
| 50431 | 17173 | Patty Hayes | Seattle, WA | Director of Public Health - Seattle & King Cou... | 2017-07-07 18:56:50 | 718 | 162 | 2276 | False | 2020-07-24 23:47:12 | ... | Twitter for Android | False | ghana | 2 | 2017 | 11 | 2020-07-24 | 23:47:12 | 23 | covid19 |
| 54712 | 17174 | Dr. Lipi #TrustDrFauci Roy | New York, USA | @NBCNews @MSNBC @Forbes Medical Contributor | ... | 2009-10-11 18:46:51 | 12485 | 4603 | 38120 | True | 2020-07-24 23:47:12 | ... | Twitter Web App | False | united states | 1 | 2009 | 11 | 2020-07-24 | 23:47:12 | 23 | covid19 |
| 54709 | 17171 | Tristyn Russelo | Alberta, Canada | NaN | 2017-06-14 22:01:54 | 5 | 68 | 57 | False | 2020-07-24 23:47:13 | ... | Twitter Web App | False | canada | 1 | 2017 | 0 | 2020-07-24 | 23:47:13 | 23 | NO HASHTAG |
5 rows × 22 columns
# Most frequent hashtags after exploding.
tag_counts = tweets_df['hashtag'].value_counts().reset_index()
tag_counts.columns = ['hashtag', 'count']
tag_counts = tag_counts.sort_values(['count'])
fig = px.bar(
    tag_counts.tail(20),
    x="count",
    y='hashtag',
    orientation='h',
    title='Top 20 hashtags',
    width=800,
    height=700,
)
fig.show()
# Character length of every tweet, shown as a histogram.
df['tweet_length'] = df['text'].str.len()
fig = px.histogram(
    df,
    x="tweet_length",
    nbins=80,
    title='Tweet length distribution',
    width=800,
    height=700,
)
fig.show()
# Longest average tweet length among users with at least 10 tweets.
verbose_users = df[df['tweets_count'] >= 10]
verbose_users = verbose_users.groupby(['user_name', 'tweets_count'])['tweet_length'].mean().reset_index()
verbose_users.columns = ['user_name', 'tweets_count', 'mean_length']
verbose_users = verbose_users.sort_values(['mean_length'])
fig = px.bar(
    verbose_users.tail(40),
    x="mean_length",
    y="user_name",
    color='tweets_count',
    orientation='h',
    title='Top 40 users with the longest average length of tweet (at least 10 tweets)',
    width=800,
    height=800,
)
fig.show()
# Shortest average tweet length among users with at least 10 tweets.
terse_users = df[df['tweets_count'] >= 10]
terse_users = terse_users.groupby(['user_name', 'tweets_count'])['tweet_length'].mean().reset_index()
terse_users.columns = ['user_name', 'tweets_count', 'mean_length']
terse_users = terse_users.sort_values(['mean_length'])
fig = px.bar(
    terse_users.head(40),
    x="mean_length",
    y="user_name",
    color='tweets_count',
    orientation='h',
    title='Top 40 users with the shortest average length of tweet (at least 10 tweets)',
    width=800,
    height=800,
)
fig.show()
def build_wordcloud(df, title):
    """Render a 50-word wordcloud for the given text collection under `title`."""
    cloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=50,
        max_font_size=40,
        random_state=666,  # fixed seed -> reproducible layout
    ).generate(str(df))
    figure = plt.figure(1, figsize=(14, 14))
    plt.axis('off')
    figure.suptitle(title, fontsize=16)
    figure.subplots_adjust(top=2.3)
    plt.imshow(cloud)
    plt.show()
# Wordclouds for the whole corpus and a few individual accounts.
# FIX: chart titles now match the data actually plotted — the old code
# labelled the CNN cloud "Reuters", mislabelled the user-description cloud
# as "Blood Donors India", and misspelled "Prevalent" as "Revalent".
build_wordcloud(df['text'].tolist(), 'Prevalent words in tweets for all dataset')
test_df = df[df['user_name']=='CNN']
build_wordcloud(test_df['text'].tolist(), 'Prevalent words in tweets for CNN')
test_df = df[df['user_name']=='Shashan92009841']
build_wordcloud(test_df['text'].tolist(), 'Prevalent words in tweets for Shashan92009841')
test_df = df[df['user_name']=='millyjueey57']
build_wordcloud(test_df['text'].tolist(), 'Prevalent words in tweets for millyjueey57')
build_wordcloud(df['user_description'], 'Prevalent words in user descriptions')
# vec = TfidfVectorizer(stop_words="english")
# vec.fit(df['text'].values)
# features = vec.transform(df['text'].values)
# kmeans = KMeans(n_clusters=3, random_state=0)
# kmeans.fit(features)
# res = kmeans.predict(features)
# df['Cluster'] = res
# df.head()
# df[df['Cluster'] == 0].head(10)['text'].tolist()
# df[df['Cluster'] == 1].head(10)['text'].tolist()
# df[df['Cluster'] == 2].head(10)['text'].tolist()
# print('Number of samples for class 0: ', len(df[df['Cluster'] == 0]))
# print('Number of samples for class 1: ', len(df[df['Cluster'] == 1]))
# print('Number of samples for class 2: ', len(df[df['Cluster'] == 2]))
# for i in range(3):
# build_wordcloud(df[df['Cluster'] == i]['text'].tolist(), 'Wordcloud for cluster ' + str(i))
# feature = df['Cluster']
# top_n_active_cities = feature.value_counts()
# print(top_n_active_cities)
# ncount = feature.shape[0]
# plt.figure(figsize=(12,5))
# chart = sns.countplot(feature, order=feature.value_counts().index)
# plt.xticks(
# rotation=45,
# horizontalalignment='right',
# fontweight='light',
# fontsize='x-large'
# )
# chart.set_title('Sentiment Analysis of Tweets', fontsize=15)
# chart.set_xlabel("Sentiment",fontsize=20)
# chart.set_ylabel("Counts (Number of Tweets)",fontsize=20)
# # Make twin axis
# chart2 = chart.twinx()
# # Switch so count axis is on right, frequency on left
# chart2.yaxis.tick_left()
# chart.yaxis.tick_right()
# # Also switch the labels over
# chart.yaxis.set_label_position('right')
# chart2.yaxis.set_label_position('left')
# chart2.set_ylabel('Frequency [%]',fontsize=20)
# for p in chart.patches:
# x=p.get_bbox().get_points()[:,0]
# y=p.get_bbox().get_points()[1,1]
# chart.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y),
# ha='center', va='bottom') # set the alignment of the text
# # Use a LinearLocator to ensure the correct number of ticks
# chart.yaxis.set_major_locator(ticker.LinearLocator(11))
# # Fix the frequency range to 0-100
# chart2.set_ylim(0,100)
# chart.set_ylim(0,ncount)
# # And use a MultipleLocator to ensure a tick spacing of 10
# chart2.yaxis.set_major_locator(ticker.MultipleLocator(10))
# plt.show()
import re, string
def clean_tweets(text):
    '''
    Clean tweet text: remove retweet markers, citations, links, tickers,
    punctuation, HTML-entity leftovers, "via", hashtags and smileys,
    collapse whitespace, and return the lower-cased, stripped result.
    '''
    # Remove RT prefixes ("RT @user:")
    text = re.sub(r"RT @[\w]*:", "", text)
    # Remove citations ("@user:")
    text = re.sub(r"@[\w]*:", "", text)
    # Remove hyperlinks.
    # BUG FIX: the original pattern had no `+` quantifier, so it deleted
    # only ONE character after "://" and left the rest of the URL behind.
    text = re.sub(r"https?://[A-Za-z0-9./]+", "", text)
    # Remove linebreak, tab, return
    text = re.sub(r'[\n\t\r]+', ' ', text)
    # Remove tickers ($TSLA etc.)
    text = re.sub(r'\$\w*', '', text)
    # Remove punctuation
    text = re.sub('[' + string.punctuation + ']+', '', text)
    # Remove HTML-entity remnants (&amp;, gt)
    text = re.sub(r'\&*[amp]*\;|gt+', '', text)
    # Remove "via" followed by whitespace
    text = re.sub(r'via+\s', '', text)
    # Collapse multiple whitespace (run twice, as the original did)
    text = re.sub(r'\s+\s+', ' ', text)
    text = re.sub(r'\s+\s+', ' ', text)
    # Remove hashtags (note: runs after punctuation removal, so '#' is
    # usually already gone and the tag word survives — kept as before)
    text = re.sub(r'\#+[\w_]+[\w\'_\-]*[\w_]+', ' ', text)
    # Remove smileys
    text = re.sub(r'[:=]+(|o|O| )+[D\)\]]+[\(\[]+[pP]+[doO/\\]+[\(\[]+(\^_\^|)', ' ', text)
    return text.lower().strip()
def get_tweet_sentiment(tweet):
    '''
    Classify the sentiment of the passed tweet via TextBlob's polarity
    score: 'positive', 'negative', or 'neutral' for exactly zero.
    '''
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'
def regularExpression(textToFilter):
    """Strip retweet markers ("RT"/"@RT") and https URLs from each line."""
    retweet_re = re.compile('RT|@RT')
    url_re = re.compile('https://[a-zA-Z0-9+&@#/%?=~_|!:,.;]*')
    cleaned = []
    for line in textToFilter:
        cleaned.append(url_re.sub('', retweet_re.sub('', line)))
    return cleaned
def nltkTokenizer(textToTokenize):
    """Strip @mentions and #hashtags, tokenize each line with NLTK, and
    return the tokens that are not English stopwords."""
    filteredSentence = []
    mention_re = re.compile('@[a-zA-Z0-9]*', re.UNICODE)
    hashtag_re = re.compile('#[a-zA-Z0-9]*', re.UNICODE)
    stop_words = stopwords.words('english')
    for line in textToTokenize:
        cleaned = hashtag_re.sub('', mention_re.sub('', line))
        for w in word_tokenize(cleaned):
            # NOTE: `w not in '@'` is substring membership, so it also
            # filters the empty string — kept as-is to preserve behavior.
            if w not in stop_words and w not in '@' and w not in '#':
                filteredSentence.append(w)
    return filteredSentence
def tweet_to_words(raw_tweet):
    """Normalize a raw tweet: strip punctuation, mask URLs ('urlsite') and
    numbers ('contnum'), lower-case, and drop English stopwords."""
    no_punct = ''.join(ch for ch in raw_tweet if ch not in string.punctuation)
    masked = re.sub(r'((www\S+)|(http\S+))', 'urlsite', no_punct)
    masked = re.sub(r'\d+', 'contnum', masked)
    masked = re.sub(' +', ' ', masked)
    tokens = masked.lower().split()
    stop_set = set(stopwords.words("english"))
    return " ".join(tok for tok in tokens if tok not in stop_set)
def users(tweet):
    """Return, for each line in `tweet`, the list of @-mentions it contains."""
    mention_re = re.compile('@[a-zA-Z0-9]*', re.UNICODE)
    return [mention_re.findall(line) for line in tweet]
def split_into_tokens(Text):
    """Return the word tokens of `Text` as produced by TextBlob."""
    blob = TextBlob(Text)
    return blob.words
def split_into_lemmas(Text):
    """Lowercase Text and return the lemma (base form) of each word."""
    lowered_blob = TextBlob(Text.lower())
    return [word.lemma for word in lowered_blob.words]
# Clean every raw tweet (URLs, hashtags, smileys, repeated whitespace are
# stripped by clean_tweets, defined above) into a lowercase column that the
# sentiment step below consumes.
df["cleaned_text"] = df["text"].apply(lambda tweet: clean_tweets(tweet))
df["cleaned_text"].head()
54713 protestors needed 6pm10pm daily now 1450 e mck... 54710 dont take your eye off the ball there are stil... 50431 excellent partnership to prevent covid19 thank... 54712 always honored to speak w nicolledwallace abou... 54709 apparently at the timberlea saveonfoods store ... Name: cleaned_text, dtype: object
# get sentiment using TextBlob polarity (parallelized via pandarallel)
df["sentiment"] = df["cleaned_text"].parallel_apply(get_tweet_sentiment)
df["sentiment"].value_counts()
neutral 74757 positive 74339 negative 29587 Name: sentiment, dtype: int64
# saving dataframe snapshot (now including sentiments), tagged with today's date
df.to_csv("covid_tweets_with_sentiments_{}.csv".format(datetime.date.today()), index=False)
# import spacy
# nlp = spacy.load("en_core_web_sm")
# df["entities"] = df["cleaned_text"].parallel_apply(lambda x: [(ent.text, ent.label_)
# if (not ent.text.startswith("#"))
# else "" for ent in nlp(x).ents])
# entities_graph = []
# for i in df["entities"]:
# for j in i:
# entities_graph.append(j)
Assigning entities in a list
# organisation = []
# person = []
# geopolitical_entity = []
# norp = []
# cardinal = []
# for i in entities_graph:
# for j in i:
# if j=="ORG":
# organisation.append(j)
# elif j=="PERSON":
# person.append(j)
# elif j=="GPE":
# geopolitical_entity. append(j)
# elif j=="NORP":
# norp.append(j)
# elif j=="CARDINAL":
# cardinal.append(j)
# else: pass
# organisation = len(organisation)
# person = len(person)
# geopolitical_entity = len(geopolitical_entity)
# norp = len(norp)
# cardinal = len(cardinal)
# classes = ["organisation" , "person", "geopolitical_entity", "norp", "cardinal"]
Dividing the origin of all tweets
# import matplotlib.pyplot as pt
# plt.figure(figsize=(16,9))
# graph = [organisation, person, geopolitical_entity, norp,cardinal]
# colors = ["c","b","r","g","y"]
# explode = [0,0.2,0.1,0,0]
# textprops = {"Fontsize":15}
# wedgeprops = {"linewidth": 4,
# "width": 1,
# "edgecolor" : "k"}
# fontdict = {"Fontsize":25}
# plt.title("Analysis of Entities", fontdict=fontdict)
# plt.pie(graph,
# labels = classes,
# explode = explode,
# colors = colors,
# autopct = "%0.2f%%",
# radius = 1,
# textprops = textprops,
# pctdistance = 0.6,
# labeldistance = 1.1,
# wedgeprops = wedgeprops,
# rotatelabels = False )
# plt.legend(loc = 3)
# plt.savefig("entities_piechart", dpi = 1000, quality = 99)
# plt.show()
The tweets having a positive, negative, or neutral sentiment have already been determined. Here, the coding displays this information pictorially as a pie chart, using matplotlib's pie function with exploded wedges (Figures 27 and 28). Based on the sentiment counts above, the classification of the tweets into the three classes is roughly 41.8% neutral, 41.6% positive, and 16.6% negative.
import matplotlib.ticker as ticker
def show_sentiment_analysis_barplot(dataframe, sentiment_column, country="all"):
    """Plot a seaborn countplot of sentiment labels with two y-axes:
    raw counts on the right and relative frequency (%) on the left.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain `sentiment_column` (and a `country` column when filtering).
    sentiment_column : str
        Name of the column holding the sentiment labels.
    country : str
        Country name to filter on, or "all" for no filtering.
        NOTE(review): the comparison is exact, so it assumes the same casing
        as the `country` column (lowercase here) — confirm against callers.
    """
    if country != "all":
        dataframe = dataframe[dataframe.country == country]
    feature = dataframe[sentiment_column]
    top_n_active_cities = feature.value_counts()
    print(top_n_active_cities)
    # total number of tweets plotted; used for the % annotations and y-limit
    ncount = feature.shape[0]
    plt.figure(figsize=(12,5))
    # bars ordered from most to least frequent sentiment
    chart = sns.countplot(feature, order=feature.value_counts().index)
    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='x-large'
    )
    if country == "all":
        chart.set_title('Sentiment Analysis of {} Tweets in all countries'.format(ncount), fontsize=15)
    else:
        chart.set_title('Sentiment Analysis of {} Tweets in {}'.format(ncount,' '.join([name.capitalize() for name in country.split(" ")])), fontsize=15)
    chart.set_xlabel("Sentiment",fontsize=20)
    chart.set_ylabel("Counts (Number of Tweets)",fontsize=20)
    # Make twin axis so counts and frequency share one plot
    chart2 = chart.twinx()
    # Switch so count axis is on right, frequency on left
    chart2.yaxis.tick_left()
    chart.yaxis.tick_right()
    # Also switch the labels over
    chart.yaxis.set_label_position('right')
    chart2.yaxis.set_label_position('left')
    chart2.set_ylabel('Frequency [%]',fontsize=20)
    # Annotate each bar with its share of all plotted tweets
    for p in chart.patches:
        x=p.get_bbox().get_points()[:,0]
        y=p.get_bbox().get_points()[1,1]
        chart.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y),
            ha='center', va='bottom') # set the alignment of the text
    # Use a LinearLocator to ensure the correct number of ticks
    chart.yaxis.set_major_locator(ticker.LinearLocator(11))
    # Fix the frequency range to 0-100
    chart2.set_ylim(0,100)
    chart.set_ylim(0,ncount)
    # And use a MultipleLocator to ensure a tick spacing of 10
    chart2.yaxis.set_major_locator(ticker.MultipleLocator(10))
    plt.show()
# Overall sentiment distribution across all countries
show_sentiment_analysis_barplot(dataframe=df, sentiment_column='sentiment')
neutral 74756 positive 74337 negative 29587 Name: sentiment, dtype: int64
# Take the token after the first comma of user_location (usually the
# state/province or country) as a coarse 'location' key.
# NOTE(review): user_location values without a comma end up as NaN here —
# confirm that dropping them later is intended.
df['location'] = df['user_location'].str.split(',', expand=True)[1].str.lstrip().str.rstrip()
res = df.groupby(['location'])['text'].count().reset_index()
df.head()
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | ... | tweets_count | year_created | hashtags_count | day | time | hour | tweet_length | cleaned_text | sentiment | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54713 | 17175 | Jessica Luther Rummel | Denton, Texas | Mother. Partner. Scholar. Activist. Alchemist.... | 2020-07-05 03:28:16 | 234 | 28 | 220 | False | 2020-07-24 23:47:08 | ... | 1 | 2020 | 0 | 2020-07-24 | 23:47:08 | 23 | 140 | protestors needed 6pm10pm daily now 1450 e mck... | negative | Texas |
| 54710 | 17172 | Clive Gorman | Victoria, British Columbia | Marketing Director @CNEGames for @idlechampion... | 2011-12-23 02:02:35 | 597 | 224 | 23824 | False | 2020-07-24 23:47:12 | ... | 2 | 2011 | 11 | 2020-07-24 | 23:47:12 | 23 | 139 | dont take your eye off the ball there are stil... | positive | British Columbia |
| 50431 | 17173 | Patty Hayes | Seattle, WA | Director of Public Health - Seattle & King Cou... | 2017-07-07 18:56:50 | 718 | 162 | 2276 | False | 2020-07-24 23:47:12 | ... | 2 | 2017 | 11 | 2020-07-24 | 23:47:12 | 23 | 96 | excellent partnership to prevent covid19 thank... | positive | WA |
| 54712 | 17174 | Dr. Lipi #TrustDrFauci Roy | New York, USA | @NBCNews @MSNBC @Forbes Medical Contributor | ... | 2009-10-11 18:46:51 | 12485 | 4603 | 38120 | True | 2020-07-24 23:47:12 | ... | 1 | 2009 | 11 | 2020-07-24 | 23:47:12 | 23 | 136 | always honored to speak w nicolledwallace abou... | neutral | USA |
| 54709 | 17171 | Tristyn Russelo | Alberta, Canada | NaN | 2017-06-14 22:01:54 | 5 | 68 | 57 | False | 2020-07-24 23:47:13 | ... | 1 | 2017 | 0 | 2020-07-24 | 23:47:13 | 23 | 140 | apparently at the timberlea saveonfoods store ... | positive | Canada |
5 rows × 25 columns
# Build a country-name -> ISO alpha-3 lookup (`countries` is loaded earlier
# in the notebook, outside this view).
country_dict = {}
for c in countries:
    country_dict[c.name] = c.alpha3
# Start alpha3 as a copy of location, then replace names that map to a code
res['alpha3'] = res['location']
res = res.replace({"alpha3": country_dict})
# Top-5 countries from the earlier `tw` table
country_list = tw['Country'].tolist()[:5]
# Keep rows that are USA, in the top-5 list, or whose location actually
# mapped to a code (location != alpha3 after the replace above).
res = res[
    (res['alpha3'] == 'USA') |
    (res['location'].isin(country_list)) |
    (res['location'] != res['alpha3'])
]
# Normalize common UK / US location spellings onto a single country code
gbr = ['England', 'UK', 'London', 'United Kingdom']
us = ['United States', 'NY', 'CA', 'GA']
res = res[res['location'].notnull()]
res.loc[res['location'].isin(gbr), 'alpha3'] = 'GBR'
res.loc[res['location'].isin(us), 'alpha3'] = 'USA'
res.loc[res['alpha3'] == 'USA', 'location'] = 'USA'
res.loc[res['alpha3'] == 'GBR', 'location'] = 'United Kingdom'
# Aggregate tweet counts per (country, code) for the choropleth below
plot = res.groupby(['location', 'alpha3'])['text'].sum().reset_index()
plot.head()
| location | alpha3 | text | |
|---|---|---|---|
| 0 | Afghanistan | AFG | 24 |
| 1 | Albania | ALB | 2 |
| 2 | Argentina | ARG | 12 |
| 3 | Armenia | ARM | 10 |
| 4 | Australia | AUS | 1061 |
# World choropleth of tweet counts per country.
# NOTE(review): 'text' holds *total* tweet counts after the groupby, so the
# title's "for every day" looks inaccurate — confirm intent.
fig = px.choropleth(
    plot,
    locations="alpha3",
    hover_name='location',
    color="text",
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Tweets from different countries for every day',
    width=800,
    height=600
)
fig.show()
# Repeat the same location-normalization pipeline, this time counting
# distinct (location, user_name) pairs to map active users per country.
res = df.groupby([ 'location', 'user_name'])['text'].count().reset_index()
res = res[['location', 'user_name']]
res['alpha3'] = res['location']
res = res.replace({"alpha3": country_dict})
country_list = tw['Country'].tolist()[:5]
# Same keep-filter as the tweets pipeline above
res = res[
    (res['alpha3'] == 'USA') |
    (res['location'].isin(country_list)) |
    (res['location'] != res['alpha3'])
]
gbr = ['England', 'UK', 'London', 'United Kingdom']
us = ['United States', 'NY', 'CA', 'GA']
res = res[res['location'].notnull()]
res.loc[res['location'].isin(gbr), 'alpha3'] = 'GBR'
res.loc[res['location'].isin(us), 'alpha3'] = 'USA'
res.loc[res['alpha3'] == 'USA', 'location'] = 'USA'
res.loc[res['alpha3'] == 'GBR', 'location'] = 'United Kingdom'
# user_name count per country = number of active users
plot = res.groupby(['location', 'alpha3'])['user_name'].count().reset_index()
# NOTE(review): counts are totals, not per-day, despite the title — confirm.
fig = px.choropleth(
    plot,
    locations="alpha3",
    hover_name='location',
    color="user_name",
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Numbers of active users for every day',
    width=800,
    height=600
)
fig.show()
# Reload today's saved snapshot from disk so the analysis below can restart
# without recomputing sentiments. Note the DtypeWarning in the output:
# some columns come back with mixed types.
df = pd.read_csv("covid_tweets_with_sentiments_{}.csv".format(datetime.date.today()))
print(df.shape)
df.head()
(178688, 24)
/home/ali/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (0,8,13) have mixed types.Specify dtype option on import or set low_memory=False.
| index | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | ... | country | tweets_count | year_created | hashtags_count | day | time | hour | tweet_length | cleaned_text | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17175 | Jessica Luther Rummel | Denton, Texas | Mother. Partner. Scholar. Activist. Alchemist.... | 2020-07-05 03:28:16 | 234.0 | 28.0 | 220.0 | False | 2020-07-24 23:47:08 | ... | united states | 1.0 | 2020.0 | 0.0 | 2020-07-24 | 23:47:08 | 23.0 | 140.0 | protestors needed 6pm10pm daily now 1450 e mck... | negative |
| 1 | 17172 | Clive Gorman | Victoria, British Columbia | Marketing Director @CNEGames for @idlechampion... | 2011-12-23 02:02:35 | 597.0 | 224.0 | 23824.0 | False | 2020-07-24 23:47:12 | ... | canada | 2.0 | 2011.0 | 11.0 | 2020-07-24 | 23:47:12 | 23.0 | 139.0 | dont take your eye off the ball there are stil... | positive |
| 2 | 17173 | Patty Hayes | Seattle, WA | Director of Public Health - Seattle & King Cou... | 2017-07-07 18:56:50 | 718.0 | 162.0 | 2276.0 | False | 2020-07-24 23:47:12 | ... | ghana | 2.0 | 2017.0 | 11.0 | 2020-07-24 | 23:47:12 | 23.0 | 96.0 | excellent partnership to prevent covid19 thank... | positive |
| 3 | 17174 | Dr. Lipi #TrustDrFauci Roy | New York, USA | @NBCNews @MSNBC @Forbes Medical Contributor | ... | 2009-10-11 18:46:51 | 12485.0 | 4603.0 | 38120.0 | True | 2020-07-24 23:47:12 | ... | united states | 1.0 | 2009.0 | 11.0 | 2020-07-24 | 23:47:12 | 23.0 | 136.0 | always honored to speak w nicolledwallace abou... | neutral |
| 4 | 17171 | Tristyn Russelo | Alberta, Canada | NaN | 2017-06-14 22:01:54 | 5.0 | 68.0 | 57.0 | False | 2020-07-24 23:47:13 | ... | canada | 1.0 | 2017.0 | 0.0 | 2020-07-24 | 23:47:13 | 23.0 | 140.0 | apparently at the timberlea saveonfoods store ... | positive |
5 rows × 24 columns
# Inspect the columns present after the reload
df.columns
Index(['index', 'user_name', 'user_location', 'user_description',
'user_created', 'user_followers', 'user_friends', 'user_favourites',
'user_verified', 'date', 'text', 'hashtags', 'source', 'is_retweet',
'country', 'tweets_count', 'year_created', 'hashtags_count', 'day',
'time', 'hour', 'tweet_length', 'cleaned_text', 'sentiment'],
dtype='object')
Build a module for text standardization
# get the most similar sentence for a given sentence using Cosine Similarity
def most_similar_sentence(sentence, all_sentences):
    """Find the entry of `all_sentences` most similar to `sentence` using
    2-gram cosine similarity (strsim's Cosine).

    Only candidates scoring strictly above 0.5 are considered; if none
    qualifies, the fallback ('', 0.5, 0) is returned (same as the original).

    Returns
    -------
    tuple
        (most similar sentence, its cosine similarity, its index).
    """
    # Hoisted out of the loop: the Cosine helper and the query profile are
    # loop-invariant (the original rebuilt both on every iteration).
    cosine = Cosine(2)
    p0 = cosine.get_profile(sentence)
    max_cosine_sim = 0.5  # minimum similarity threshold
    most_sim_sentence = ''
    most_sim_sentence_index = 0
    for index, sim_sentence in enumerate(all_sentences):
        p1 = cosine.get_profile(sim_sentence)
        if not p1:
            continue
        # Compute the similarity once; the original evaluated it twice
        # (once in the condition, once for the assignment).
        sim = cosine.similarity_profiles(p0, p1)
        if sim > max_cosine_sim:
            max_cosine_sim = sim
            most_sim_sentence = sim_sentence
            most_sim_sentence_index = index
    return (most_sim_sentence, max_cosine_sim, most_sim_sentence_index)
# Fuzzy-match the requested country name against the country values present
# in the tweets dataframe, then plot that country's sentiment distribution.
country = "India"
sim_country = most_similar_sentence(country, df.country.dropna().unique().tolist())[0]
show_sentiment_analysis_barplot(dataframe=df, sentiment_column='sentiment', country=sim_country)
positive 8055 neutral 7873 negative 2444 Name: sentiment, dtype: int64
# Wordclouds for the matched country: all tweets, then one per sentiment class
build_wordcloud(df[df['country'] == sim_country]['text'].tolist(),
    'Wordcloud for {} Tweets'.format(' '.join([name.capitalize() for name in sim_country.split(" ")])))
build_wordcloud(df[(df['country'] == sim_country) & (df['sentiment'] == 'positive')]['text'].tolist(),
    'Wordcloud for {} Positive Tweets'.format(' '.join([name.capitalize() for name in sim_country.split(" ")])))
build_wordcloud(df[(df['country'] == sim_country) & (df['sentiment'] == 'negative')]['text'].tolist(),
    'Wordcloud for {} Negative Tweets'.format(' '.join([name.capitalize() for name in sim_country.split(" ")])))
build_wordcloud(df[(df['country'] == sim_country) & (df['sentiment'] == 'neutral')]['text'].tolist(),
    'Wordcloud for {} Neutral Tweets'.format(' '.join([name.capitalize() for name in sim_country.split(" ")])))
# Pull the current worldwide covid overview table via the covid_daily package
import covid_daily
covid_daily_df = covid_daily.overview(as_json=False)
covid_daily_df.head()
| Country,Other | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | TotCases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | Population | 1 Caseevery X ppl | 1 Deathevery X ppl | 1 Testevery X ppl | New Cases/1M pop | New Deaths/1M pop | Active Cases/1M pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | World | 207632645 | 154078 | 4368898 | 2725 | 186106296 | 126643 | 17157451 | 106638 | 26637 | 560 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | USA | 37435835 | 0 | 637439 | 0 | 30144609 | 0 | 6653787 | 20061 | 112362 | 1913 | 554743011 | 1665032 | 333172543 | 9 | 523 | 1 | 0 | 0 | 19971 |
| 2 | India | 32192576 | 622 | 431253 | 0 | 31376015 | 7555 | 385308 | 8944 | 23074 | 309 | 493624440 | 353812 | 1395161030 | 43 | 3235 | 3 | 0 | 0 | 276 |
| 3 | Brazil | 20350142 | 0 | 568833 | 0 | 19195514 | 0 | 585795 | 8318 | 94984 | 2655 | 56023684 | 261490 | 214248197 | 11 | 377 | 4 | 0 | 0 | 2734 |
| 4 | Russia | 6600836 | 21624 | 170499 | 816 | 5884316 | 16426 | 546021 | 2300 | 45210 | 1168 | 171600000 | 1175306 | 146004480 | 22 | 856 | 1 | 148 | 6 | 3740 |
def all_world_population_countries_names():
    """Fetch the list of all country names from the world-population RapidAPI.

    Returns the decoded JSON response.

    SECURITY: the original hard-coded a RapidAPI key in source. The key is
    now read from the RAPIDAPI_KEY environment variable, keeping the old
    literal only as a fallback so existing behavior is unchanged; the leaked
    key should be rotated.
    """
    import os
    url = "https://world-population.p.rapidapi.com/allcountriesname"
    headers = {
        'x-rapidapi-key': os.environ.get(
            "RAPIDAPI_KEY",
            "134117ae79msh40bb2931f9c7e4ap1aa445jsn8d1982670b09"),
        'x-rapidapi-host': "world-population.p.rapidapi.com"
    }
    # timeout added so a stalled API call cannot hang the notebook
    response = requests.request("GET", url, headers=headers, timeout=10)
    return response.json()
def get_world_population_per_country(country):
    """Fetch the population record for `country` from the world-population
    RapidAPI and return the decoded JSON response.

    SECURITY: the original hard-coded a RapidAPI key in source. The key is
    now read from the RAPIDAPI_KEY environment variable, keeping the old
    literal only as a fallback so existing behavior is unchanged; the leaked
    key should be rotated.
    """
    import os
    url = "https://world-population.p.rapidapi.com/population"
    querystring = {"country_name": country}
    headers = {
        'x-rapidapi-key': os.environ.get(
            "RAPIDAPI_KEY",
            "134117ae79msh40bb2931f9c7e4ap1aa445jsn8d1982670b09"),
        'x-rapidapi-host': "world-population.p.rapidapi.com"
    }
    # timeout added so a stalled API call cannot hang the notebook
    response = requests.request("GET", url, headers=headers,
                                params=querystring, timeout=10)
    return response.json()
def get_covid19api_all_countries():
    """GET the list of countries known to the covid19api service and
    return the decoded JSON response."""
    endpoint = "https://api.covid19api.com/countries"
    response = requests.request(
        "GET",
        endpoint,
        headers={},
        data={},
        timeout=10,
    )
    return response.json()
def get_covid19api_stats_country(country, from_date, to_date):
    """GET total covid stats for `country` between `from_date` and `to_date`
    (ISO date strings) from covid19api.

    Note: unlike the sibling helpers, this returns the raw Response object
    (callers must call .json() themselves) — kept for compatibility.
    """
    url = "https://api.covid19api.com/total/country/{}?from={}&to={}".format(
        country, from_date, to_date)
    payload = {}
    headers = {}
    # timeout added for consistency with get_covid19api_all_countries
    response = requests.request("GET", url, headers=headers, data=payload,
                                timeout=10)
    return response
def get_covid_observer_countries():
    """Scrape covid.observer and return a dict mapping country name to the
    absolute URL of that country's stats page."""
    page = requests.request("GET", "https://covid.observer/us/#countries",
                            timeout=10)
    soup = BeautifulSoup(page.content, "html.parser")
    # NOTE: relies on the third 'countries-list' div holding the country links
    countries_block = soup.find_all("div", {"class": "countries-list"})[2]
    return {anchor.text: "https://covid.observer" + anchor['href']
            for anchor in countries_block.find_all("a")}
# get world countries (name -> covid.observer URL)
all_countries = get_covid_observer_countries()
# get most similar country
sim_country = most_similar_sentence(country, all_countries.keys())[0]
# get API url for the similar country
url = all_countries[sim_country]
# make a get request
page = requests.get(url)
# declare a BeautifulSoup object from request content
soup = BeautifulSoup(page.text, 'lxml')
# text mining on BeautifulSoup object: first table on the page holds the stats
table_data = soup.find('table')
headers = []
for i in table_data.find_all('th'):
    title = i.text
    headers.append(title)
# create a DataFrame with the scraped column headers
specific_country_stat_df = pd.DataFrame(columns = headers)
# append each table row; NOTE(review): row-by-row .loc growth is O(n^2) —
# fine for this table size, but a list of rows + one DataFrame call scales
# better.
for j in table_data.find_all('tr')[1:]:
    row_data = j.find_all('td')
    row = [tr.text for tr in row_data]
    length = len(specific_country_stat_df)
    specific_country_stat_df.loc[length] = row
# replace scraped headers with stable snake_case names
specific_country_stat_df.columns = ['date','confirmed_cases','daily_growth','recovered_cases',
    'fatal_cases','active_cases','recovery_rate','mortality_rate',
    'affected_population','confirmed_per_1000','died_per_1000']
specific_country_stat_df['index'] = specific_country_stat_df.index
specific_country_stat_df['country'] = sim_country
specific_country_stat_df.head()
| date | confirmed_cases | daily_growth | recovered_cases | fatal_cases | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aug 13 | 32,117,826 | 0.0 % | 0 | 430,254 | 31,687,572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 0 | India |
| 1 | Aug 12 | 32,117,826 | 0.1 % | 0 | 430,254 | 31,687,572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 1 | India |
| 2 | Aug 11 | 32,077,706 | 0.1 % | 0 | 429,669 | 31,648,037 | 0.0 % | 1.3 % | 2.3 % | 23.5 | 0.31 | 2 | India |
| 3 | Aug 10 | 32,036,511 | 0.1 % | 0 | 429,179 | 31,607,332 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 3 | India |
| 4 | Aug 9 | 31,998,158 | 0.1 % | 0 | 428,682 | 31,569,476 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 4 | India |
# Strip thousands separators and cast the count columns to int
for col in ['confirmed_cases', 'recovered_cases', 'fatal_cases','active_cases']:
    specific_country_stat_df[col] = specific_country_stat_df[col].parallel_apply(lambda x : int(x.replace(",","")))
    # NOTE(review): this astype is redundant — the lambda above already
    # produced Python ints.
    specific_country_stat_df[col] = specific_country_stat_df[col].astype(int)
specific_country_stat_df.head()
| date | confirmed_cases | daily_growth | recovered_cases | fatal_cases | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aug 13 | 32117826 | 0.0 % | 0 | 430254 | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 0 | India |
| 1 | Aug 12 | 32117826 | 0.1 % | 0 | 430254 | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 1 | India |
| 2 | Aug 11 | 32077706 | 0.1 % | 0 | 429669 | 31648037 | 0.0 % | 1.3 % | 2.3 % | 23.5 | 0.31 | 2 | India |
| 3 | Aug 10 | 32036511 | 0.1 % | 0 | 429179 | 31607332 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 3 | India |
| 4 | Aug 9 | 31998158 | 0.1 % | 0 | 428682 | 31569476 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 4 | India |
Create a new column for the date with year
# Rows are in reverse chronological order: everything before the first
# 'Jan 1' row belongs to 2021, the remainder to 2020.
y = [', 2021'] * specific_country_stat_df[specific_country_stat_df['date']=='Jan 1'].index[0]
observations_num_in_2020 = len(specific_country_stat_df) - specific_country_stat_df[specific_country_stat_df['date']=='Jan 1'].index[0]
print(observations_num_in_2020)
y.extend([', 2020'] * observations_num_in_2020)
len(y)
338
562
# Append the year suffix to build a fully-qualified, parseable date string
specific_country_stat_df['year'] = y
specific_country_stat_df['date_with_year'] = specific_country_stat_df['date'] + specific_country_stat_df['year']
specific_country_stat_df['date_with_year'].head()
0 Aug 13, 2021 1 Aug 12, 2021 2 Aug 11, 2021 3 Aug 10, 2021 4 Aug 9, 2021 Name: date_with_year, dtype: object
Show the beginning and end of dates
# Show the date range covered by the scraped country stats
print("Beginning: ", pd.to_datetime(specific_country_stat_df['date_with_year']).min())
print("End: ", pd.to_datetime(specific_country_stat_df['date_with_year']).max())
Beginning: 2020-01-01 00:00:00 End: 2021-08-13 00:00:00
# call API of covid vaccinations (OWID's daily CSV export)
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv"
page = requests.get(url)
# save data into csv file — a context manager guarantees the handle is
# closed even if the write raises (original used open/write/close).
vaccinations_file_path = 'vaccinations.csv'
with open(vaccinations_file_path, 'wb') as vaccinations_file:
    vaccinations_file.write(page.content)
# load csv as dataframe
covid_vaccinations_df = pd.read_csv(vaccinations_file_path)
covid_vaccinations_df.head()
| location | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | total_boosters | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | total_boosters_per_hundred | daily_vaccinations_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2021-02-22 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | NaN |
| 1 | Afghanistan | AFG | 2021-02-23 | NaN | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | NaN | 35.0 |
| 2 | Afghanistan | AFG | 2021-02-24 | NaN | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | NaN | 35.0 |
| 3 | Afghanistan | AFG | 2021-02-25 | NaN | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | NaN | 35.0 |
| 4 | Afghanistan | AFG | 2021-02-26 | NaN | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | NaN | 35.0 |
Get covid vaccinations for specific country
# Match the scraped country name against OWID's 'location' values, then keep
# only that country's vaccination rows
sim_country = most_similar_sentence(specific_country_stat_df['country'].iloc[0], covid_vaccinations_df['location'].unique())[0]
specific_covid_vaccinations_df = covid_vaccinations_df[covid_vaccinations_df.location == sim_country]
specific_covid_vaccinations_df.head()
| location | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | total_boosters | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | total_boosters_per_hundred | daily_vaccinations_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16562 | India | IND | 2021-01-15 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | NaN | NaN |
| 16563 | India | IND | 2021-01-16 | 191181.0 | 191181.0 | NaN | NaN | 191181.0 | 191181.0 | 0.01 | 0.01 | NaN | NaN | 139.0 |
| 16564 | India | IND | 2021-01-17 | 224301.0 | 224301.0 | NaN | NaN | 33120.0 | 112150.0 | 0.02 | 0.02 | NaN | NaN | 81.0 |
| 16565 | India | IND | 2021-01-18 | 454049.0 | 454049.0 | NaN | NaN | 229748.0 | 151350.0 | 0.03 | 0.03 | NaN | NaN | 110.0 |
| 16566 | India | IND | 2021-01-19 | 674835.0 | 674835.0 | NaN | NaN | 220786.0 | 168709.0 | 0.05 | 0.05 | NaN | NaN | 122.0 |
Show the beginning and end of dates
# Show the date range covered by the country's vaccination records
print("Beginning: ", pd.to_datetime(specific_covid_vaccinations_df['date']).min())
print("End: ", pd.to_datetime(specific_covid_vaccinations_df['date']).max())
Beginning: 2021-01-15 00:00:00 End: 2021-08-14 00:00:00
# Parsing dates to datetime.
# Assign on an explicit copy: specific_covid_vaccinations_df is a filtered
# slice of covid_vaccinations_df, so writing into it directly raised
# pandas' SettingWithCopyWarning (visible in the original output).
specific_covid_vaccinations_df = specific_covid_vaccinations_df.copy()
specific_covid_vaccinations_df['date'] = pd.to_datetime(specific_covid_vaccinations_df['date'])
specific_country_stat_df['date_with_year'] = pd.to_datetime(specific_country_stat_df['date_with_year'])
<ipython-input-33-b60022c4c65d>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Right-join vaccinations onto the scraped stats on their date columns, so
# every scraped day is kept even when no vaccination record exists for it.
specific_country_stat_vac_df = pd.merge(specific_covid_vaccinations_df,
    specific_country_stat_df,
    how='right',
    left_on=["date"],
    right_on=["date_with_year"])
specific_country_stat_vac_df.head()
| location | iso_code | date_x | total_vaccinations | people_vaccinated | people_fully_vaccinated | total_boosters | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | ... | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | year | date_with_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | India | IND | 2021-08-13 | 536189903.0 | 416846115.0 | 119343788.0 | NaN | NaN | 5025756.0 | 38.85 | ... | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 0 | India | , 2021 | 2021-08-13 |
| 1 | India | IND | 2021-08-12 | NaN | NaN | NaN | NaN | NaN | 4943267.0 | NaN | ... | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 1 | India | , 2021 | 2021-08-12 |
| 2 | India | IND | 2021-08-11 | 523671019.0 | 406980329.0 | 116690690.0 | NaN | 4590495.0 | 4904103.0 | 37.95 | ... | 31648037 | 0.0 % | 1.3 % | 2.3 % | 23.5 | 0.31 | 2 | India | , 2021 | 2021-08-11 |
| 3 | India | IND | 2021-08-10 | 519080524.0 | 403596088.0 | 115484436.0 | NaN | 4580256.0 | 4827708.0 | 37.61 | ... | 31607332 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 3 | India | , 2021 | 2021-08-10 |
| 4 | India | IND | 2021-08-09 | 514500268.0 | 400158057.0 | 114342211.0 | NaN | 5835509.0 | 5106452.0 | 37.28 | ... | 31569476 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 4 | India | , 2021 | 2021-08-09 |
5 rows × 29 columns
# filter tweets df by specific country (fuzzy-matched against df.country)
specific_country_tweets_df = df[df.country == most_similar_sentence(country, df['country'].dropna().unique())[0]]
# create a dataframe of grouping by sentiment counts per day
sentiments_counts_df = pd.DataFrame({'count' : specific_country_tweets_df.groupby(['day','sentiment'])['sentiment'].count()}).reset_index()
# pivot so each sentiment becomes its own column, indexed by day
sentiments_counts_df = sentiments_counts_df.pivot(index="day", columns="sentiment", values="count").reset_index()
sentiments_counts_df.head()
| sentiment | day | negative | neutral | positive |
|---|---|---|---|---|
| 0 | 2020-07-24 | 1.0 | NaN | NaN |
| 1 | 2020-07-25 | 207.0 | 895.0 | 888.0 |
| 2 | 2020-07-26 | 131.0 | 437.0 | 512.0 |
| 3 | 2020-07-27 | 162.0 | 418.0 | 377.0 |
| 4 | 2020-07-28 | 130.0 | 477.0 | 449.0 |
# Make cumulative sum for sentiments counts.
# NOTE(review): NaN days (no tweets of that sentiment) stay NaN in the
# cumulative series — confirm that is intended before plotting.
for col in ['negative','neutral','positive']:
    sentiments_counts_df[col] = sentiments_counts_df[col].cumsum(axis = 0)
sentiments_counts_df.head()
| sentiment | day | negative | neutral | positive |
|---|---|---|---|---|
| 0 | 2020-07-24 | 1.0 | NaN | NaN |
| 1 | 2020-07-25 | 208.0 | 895.0 | 888.0 |
| 2 | 2020-07-26 | 339.0 | 1332.0 | 1400.0 |
| 3 | 2020-07-27 | 501.0 | 1750.0 | 1777.0 |
| 4 | 2020-07-28 | 631.0 | 2227.0 | 2226.0 |
# Parse date of sentiments_counts_df to datetime so it can be compared /
# merged with the stats-and-vaccinations dataframe
sentiments_counts_df['day'] = pd.to_datetime(sentiments_counts_df['day'])
print("Number of intersected dates between specific_country_stat_vac_df and sentiments_counts_df: ",
    len(set(specific_country_stat_vac_df['date_with_year'].unique()).intersection(set(sentiments_counts_df['day'].unique()))))
Number of intersected dates between specific_country_stat_vac_df and sentiments_counts_df: 26
# Right-join daily sentiment counts onto the stats+vaccinations dataframe
# (keeps every stats row; sentiment columns are NaN where dates don't match)
specific_country_sentiment_stat_vac_df = pd.merge(sentiments_counts_df,
    specific_country_stat_vac_df,
    how='right',
    left_on=["day"],
    right_on=["date_with_year"])
specific_country_sentiment_stat_vac_df.head()
| day | negative | neutral | positive | location | iso_code | date_x | total_vaccinations | people_vaccinated | people_fully_vaccinated | ... | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | year | date_with_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaT | NaN | NaN | NaN | India | IND | 2021-08-13 | 536189903.0 | 416846115.0 | 119343788.0 | ... | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 0 | India | , 2021 | 2021-08-13 |
| 1 | NaT | NaN | NaN | NaN | India | IND | 2021-08-12 | NaN | NaN | NaN | ... | 31687572 | 0.0 % | 1.3 % | 2.4 % | 23.5 | 0.31 | 1 | India | , 2021 | 2021-08-12 |
| 2 | NaT | NaN | NaN | NaN | India | IND | 2021-08-11 | 523671019.0 | 406980329.0 | 116690690.0 | ... | 31648037 | 0.0 % | 1.3 % | 2.3 % | 23.5 | 0.31 | 2 | India | , 2021 | 2021-08-11 |
| 3 | NaT | NaN | NaN | NaN | India | IND | 2021-08-10 | 519080524.0 | 403596088.0 | 115484436.0 | ... | 31607332 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 3 | India | , 2021 | 2021-08-10 |
| 4 | NaT | NaN | NaN | NaN | India | IND | 2021-08-09 | 514500268.0 | 400158057.0 | 114342211.0 | ... | 31569476 | 0.0 % | 1.3 % | 2.3 % | 23.4 | 0.31 | 4 | India | , 2021 | 2021-08-09 |
5 rows × 33 columns
# Inspect the merged dataframe's columns
specific_country_sentiment_stat_vac_df.columns
Index(['day', 'negative', 'neutral', 'positive', 'location', 'iso_code',
'date_x', 'total_vaccinations', 'people_vaccinated',
'people_fully_vaccinated', 'total_boosters', 'daily_vaccinations_raw',
'daily_vaccinations', 'total_vaccinations_per_hundred',
'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred',
'total_boosters_per_hundred', 'daily_vaccinations_per_million',
'date_y', 'confirmed_cases', 'daily_growth', 'recovered_cases',
'fatal_cases', 'active_cases', 'recovery_rate', 'mortality_rate',
'affected_population', 'confirmed_per_1000', 'died_per_1000', 'index',
'country', 'year', 'date_with_year'],
dtype='object')
Filter dataframe to have only rows whose date is in tweets dataframe
# Keep only rows whose date also appears in the tweets-derived daily counts
specific_country_sentiment_stat_vac_df = specific_country_sentiment_stat_vac_df[specific_country_sentiment_stat_vac_df.date_with_year.isin(sentiments_counts_df.day)]
specific_country_sentiment_stat_vac_df.head()
| day | negative | neutral | positive | location | iso_code | date_x | total_vaccinations | people_vaccinated | people_fully_vaccinated | ... | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | year | date_with_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 348 | 2020-08-30 | 2444.0 | 7873.0 | 8055.0 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 781975 | 76.6 % | 1.8 % | 0.27 % | 2.65 | 0.047 | 348 | India | , 2020 | 2020-08-30 |
| 349 | 2020-08-29 | 2279.0 | 7421.0 | 7525.0 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 765302 | 76.6 % | 1.8 % | 0.26 % | 2.59 | 0.046 | 349 | India | , 2020 | 2020-08-29 |
| 356 | 2020-08-22 | 2273.0 | 7400.0 | 7489.0 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 707668 | 74.9 % | 1.9 % | 0.22 % | 2.23 | 0.041 | 356 | India | , 2020 | 2020-08-22 |
| 360 | 2020-08-18 | 2084.0 | 6730.0 | 6871.0 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 676549 | 73.6 % | 1.9 % | 0.2 % | 2.03 | 0.039 | 360 | India | , 2020 | 2020-08-18 |
| 361 | 2020-08-17 | 2040.0 | 6567.0 | 6701.0 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 673213 | 73.2 % | 1.9 % | 0.2 % | 1.98 | 0.038 | 361 | India | , 2020 | 2020-08-17 |
5 rows × 33 columns
# Min-max scale each metric to [0, 1] so counts, cases, and vaccinations can
# share one plot axis. Each column is fit independently.
# NOTE(review): the all-NaN vaccination columns trigger the RuntimeWarnings
# seen in the output and stay NaN after scaling — confirm that's acceptable.
min_max_scaler = preprocessing.MinMaxScaler()
for col in ['negative','neutral','positive','confirmed_cases',
    'recovered_cases', 'fatal_cases', 'active_cases',
    'total_vaccinations', 'people_fully_vaccinated']:
    specific_country_sentiment_stat_vac_df[col] = min_max_scaler.fit_transform(specific_country_sentiment_stat_vac_df[[col]].values)
specific_country_sentiment_stat_vac_df.head()
/home/ali/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:400: RuntimeWarning: All-NaN slice encountered /home/ali/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:401: RuntimeWarning: All-NaN slice encountered /home/ali/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:400: RuntimeWarning: All-NaN slice encountered /home/ali/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:401: RuntimeWarning: All-NaN slice encountered
| day | negative | neutral | positive | location | iso_code | date_x | total_vaccinations | people_vaccinated | people_fully_vaccinated | ... | active_cases | recovery_rate | mortality_rate | affected_population | confirmed_per_1000 | died_per_1000 | index | country | year | date_with_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 348 | 2020-08-30 | 1.000000 | 1.000000 | 1.000000 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 1.000000 | 76.6 % | 1.8 % | 0.27 % | 2.65 | 0.047 | 348 | India | , 2020 | 2020-08-30 |
| 349 | 2020-08-29 | 0.932460 | 0.935225 | 0.926050 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 0.948815 | 76.6 % | 1.8 % | 0.26 % | 2.59 | 0.046 | 349 | India | , 2020 | 2020-08-29 |
| 356 | 2020-08-22 | 0.930004 | 0.932216 | 0.921027 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 0.771883 | 74.9 % | 1.9 % | 0.22 % | 2.23 | 0.041 | 356 | India | , 2020 | 2020-08-22 |
| 360 | 2020-08-18 | 0.852640 | 0.836199 | 0.834798 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 0.676350 | 73.6 % | 1.9 % | 0.2 % | 2.03 | 0.039 | 360 | India | , 2020 | 2020-08-18 |
| 361 | 2020-08-17 | 0.834630 | 0.812840 | 0.811079 | NaN | NaN | NaT | NaN | NaN | NaN | ... | 0.666109 | 73.2 % | 1.9 % | 0.2 % | 1.98 | 0.038 | 361 | India | , 2020 | 2020-08-17 |
5 rows × 33 columns
# Colour per plotted series: each case-count series gets its own hue, while
# the vaccination pair and the three sentiment series share a grey / maroon
# family so the groups are visually distinct.  Hoisted out of the loop — the
# original rebuilt this dict on every iteration — and the stray leading
# spaces in the hex codes (" #979a9a", " #641e16") are removed, since plotly
# expects clean CSS colour strings.
color_dict = {
    "confirmed_cases": "#118ab2",
    "recovered_cases": "#ef476f",
    "fatal_cases": "#06d6a0",
    "active_cases": "#073b4c",
    "total_vaccinations": "#979a9a",
    "people_fully_vaccinated": "#979a9a",
    "negative": "#641e16",
    "neutral": "#641e16",
    "positive": "#641e16",
}

base_stats_fig = go.Figure()
# One scatter trace per statistic; dict insertion order preserves the
# original trace order (confirmed, recovered, fatal, active, vaccinations,
# fully vaccinated, negative, neutral, positive).
for column, color in color_dict.items():
    base_stats_fig.add_trace(
        go.Scatter(
            x=specific_country_sentiment_stat_vac_df['date_with_year'],
            y=specific_country_sentiment_stat_vac_df[column],
            name=column,
            line=dict(color=color),
            hovertemplate='<br><b>Date</b>: %{x}'+'<br><i>Count</i>:'+'%{y}',
        )
    )
# Order of traces in the figure (must match the order they were added).
_trace_order = ['confirmed_cases', 'recovered_cases', 'fatal_cases',
                'active_cases', 'total_vaccinations',
                'people_fully_vaccinated', 'negative', 'neutral', 'positive']


def _single_trace_button(name):
    """Dropdown button that shows only the trace called *name*.

    The visibility mask has exactly one True, at *name*'s position among the
    figure's traces (the original hand-written masks carried 12 booleans for
    only 9 traces).
    """
    return dict(label=name,
                method='update',
                args=[{'visible': [t == name for t in _trace_order]},
                      {'title': name, 'showlegend': True}])


base_stats_fig.update_layout(
    updatemenus=[
        # Dropdown 1: pick which statistic is plotted (or all of them).
        dict(
            buttons=[dict(label='All Cases',
                          method='update',
                          args=[{'visible': [True] * len(_trace_order)},
                                {'title': 'All Cases', 'showlegend': True}])]
                    + [_single_trace_button(name) for name in
                       ['active_cases', 'confirmed_cases', 'recovered_cases',
                        'fatal_cases', 'total_vaccinations',
                        'people_fully_vaccinated',
                        'negative', 'neutral', 'positive']],
            type="dropdown",
            direction="down",
            showactive=True,
            x=0,
            xanchor="left",
            y=1.25,
            yanchor="top",
        ),
        # Dropdown 2: toggle linear / log y-axis.  'relayout' takes a single
        # dict of dotted attribute paths; using 'yaxis.type' changes only the
        # axis type.  The original passed {'yaxis': {'type': ...}}, which
        # replaces the whole yaxis object (wiping the "# of Cases" axis title
        # set below) and, by copy-paste mistake, re-titled the chart
        # 'active_cases' when switching to log scale.
        dict(
            buttons=[
                dict(label='Linear Scale',
                     method='relayout',
                     args=[{'yaxis.type': 'linear'}]),
                dict(label='Log Scale',
                     method='relayout',
                     args=[{'yaxis.type': 'log'}]),
            ],
            type="dropdown",
            direction="down",
            showactive=True,
            x=0,
            xanchor="left",
            y=1.36,
            yanchor="top",
        ),
    ])
# Tick labels are hidden; exact dates are available via the hover template.
base_stats_fig.update_xaxes(showticklabels=False)
base_stats_fig.update_layout(
    title_text="Statistics about Covid19 in {}".format(country), title_x=0.5, title_font_size=20,
    legend=dict(orientation='h', yanchor='top', y=1.15, xanchor='right', x=1),
    paper_bgcolor="mintcream",
    xaxis_title="Date",
    yaxis_title="# of Cases")
base_stats_fig.show()